In [1]:
import pandas as pd
import numpy as np
import os.path
from os import listdir
from os.path import isfile, join
import os

# **Data Preparation:**
 ### The first steps to use this repo revolve around downloading the data and transforming it into a usable format


### 1. Download all of the data from the *US Financial News Articles* page on kaggle. You will need to log into your kaggle account to do this
>https://www.kaggle.com/jeet2016/us-financial-news-articles/download



### 2. Copy the filepath of your downloaded file into the argument of the *prepare_data(path)* function below and run it.
#### The function may take some time to run, as there are 306,242 articles in the dataset. When the function is done running, you will have a new file named "newsdf.csv".  This is a csv with all of the articles in a usable format for future pandas use. To avoid rerunning it once completed, you can comment out the line where the funciton runs.


In [6]:
def prepare_data(path):
    newsdf = pd.DataFrame()
    dfs = []
    for dirname, _, filenames in os.walk(path):
        for filename in filenames:
            if filename.lower().endswith((".json")): 
                print(os.path.join(dirname, filename))
                data = pd.read_json(os.path.join(dirname, filename), lines=True)
                dfs.append(data)
    newsdf = pd.concat(dfs, ignore_index = True)
    newsdf.to_csv("newsdf.csv")

In [8]:
## Change "path" to the filepath of the file you downloaded
prepare_data('/Users/jrsyc1/Downloads/archive')

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0047188.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0014010.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0042670.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0023363.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0008033.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0043962.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0036862.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0000403.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0056263.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0032088.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0037770.json

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0050708.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0003690.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0004155.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0052735.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0038939.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0027635.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0025608.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0010546.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0046326.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0017283.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0026927.json

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0055859.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0037265.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0048768.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0035258.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0000116.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0056776.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0012202.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0044462.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0039152.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0025171.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0020689.json

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0040134.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0009858.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0014293.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0013556.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0045336.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0038606.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0039914.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0026618.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0024625.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0007145.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0051725.json

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0046664.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0010004.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0044659.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0012039.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0039369.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0027377.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0047976.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0009225.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0038981.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0022175.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0020148.json

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0022933.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0013297.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0006996.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0037232.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0018988.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0007684.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0056721.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0000141.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0002486.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0053523.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0005343.json

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0029555.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0054065.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0002605.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0056058.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0000638.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0035576.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0003917.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0016216.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0023158.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0008208.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0021165.json

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0017057.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0010792.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0039105.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0044435.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0012255.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0015590.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0046408.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0010268.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0025126.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0001183.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0050026.json

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0029040.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0054570.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0002310.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0021470.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0024388.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0017811.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0044989.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0040163.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0016703.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0019399.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0037731.json

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0027361.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0047960.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0046672.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0010012.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0049587.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0018422.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0033772.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0032860.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0028094.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0052261.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0026966.json

/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0032026.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0051508.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0007368.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0002490.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0053535.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0005355.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0007692.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0056737.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0000157.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0035219.json
/Users/jrsyc1/Downloads/archive/2018_03_112b52537b67659ad3609a234388c50a/news_0048729.json

KeyboardInterrupt: 