# Data Cleaning

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

plt.style.use('seaborn-darkgrid')

import warnings
warnings.filterwarnings("ignore")

In [None]:
#df = pd.read_json('data/elonDFV7.json')
df = pd.read_json('data/elonMLDFV7.json')

In [3]:
#2680 rows 
#financial data we will do a linear interpolation
#need to do train_test_split before linear interpolation 
#Tweet data we will just replace with 0
#DayOfweek switch to categorical and change days into numbers 0 = Monday -> 6 = Sunday
#Drop tweetType, tweet since we already did distilbert
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2712 entries, 0 to 2711
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               2712 non-null   datetime64[ns]
 1   retweet_count      1782 non-null   float64       
 2   fav_count          1782 non-null   float64       
 3   tweetLen           1782 non-null   float64       
 4   tweet              1782 non-null   object        
 5   tweetType          1782 non-null   object        
 6   Business positive  1782 non-null   float64       
 7   Business neutral   1782 non-null   float64       
 8   Business negative  1782 non-null   float64       
 9   Personal positive  1782 non-null   float64       
 10  Personal neutral   1782 non-null   float64       
 11  Personal negative  1782 non-null   float64       
 12  compound           1782 non-null   float64       
 13  High               2179 non-null   float64       
 14  Low     

In [4]:
df.columns

Index(['date', 'retweet_count', 'fav_count', 'tweetLen', 'tweet', 'tweetType',
       'Business positive', 'Business neutral', 'Business negative',
       'Personal positive', 'Personal neutral', 'Personal negative',
       'compound', 'High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close',
       'dayOfWeek'],
      dtype='object')

In [5]:
#Change dayOfWeek to numerical and categorical
df['dayOfWeek'] = pd.Categorical(df['dayOfWeek'])
df['dayOfWeek'] = df['dayOfWeek'].cat.codes

# df1['dayOfWeek'] = pd.Categorical(df1['dayOfWeek'])
# df1['dayOfWeek'] = df1['dayOfWeek'].cat.codes

In [6]:
#Drop tweetType, tweet since we already did distilbert
df = df.drop(columns = ['tweetType', 'tweet'])
# df1 = df1.drop(columns = ['tweetType', 'tweet'])

In [8]:
#Going to use Adjusted Close as our Y variable 
df.corr()['Adj Close']
#df1.corr()['Adj Close']

retweet_count        0.255058
fav_count            0.417315
tweetLen             0.214357
Business positive    0.155887
Business neutral     0.125191
Business negative    0.072118
Personal positive    0.335091
Personal neutral     0.334035
Personal negative    0.281560
compound             0.148557
High                 0.996232
Low                  0.996889
Open                 0.995694
Close                0.997495
Volume               0.493952
Adj Close            1.000000
dayOfWeek            0.003611
Name: Adj Close, dtype: float64

In [None]:
#cor(i,j) = cov(i,j)/[stdev(i)*stdev(j)]
#If the values of the ith or jth variable do not vary, 
#then the respective standard deviation will be zero and 
#so will the denominator of the fraction. 
#Thus, the correlation will be NaN.

In [9]:
#Set the date as the index
df = df.set_index('date')
#df1 = df1.set_index('date')

In [11]:
#cleaning the dataframe for train test split for random forest and xgboost

#Change Financial NaNs using Linear
stockDataclean = ['Close', 'Open', 'High', 'Low', 'Volume', 'Adj Close']

#Set Tweet data to 0s 
zeroColclean = ['fav_count', 'retweet_count', 'Business positive',
           'Business neutral', 'Business negative', 'Personal positive',
           'Personal neutral', 'Personal negative',
           'compound', 'tweetLen']


df[stockDataclean] = df[stockDataclean].interpolate(method = 'linear', 
                                                    limit_direction="both")

df[zeroColclean] = df[zeroColclean].fillna(0)



In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2712 entries, 2011-12-01 to 2020-07-31
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   retweet_count      2712 non-null   float64
 1   fav_count          2712 non-null   float64
 2   tweetLen           2712 non-null   float64
 3   Business positive  2712 non-null   float64
 4   Business neutral   2712 non-null   float64
 5   Business negative  2712 non-null   float64
 6   Personal positive  2712 non-null   float64
 7   Personal neutral   2712 non-null   float64
 8   Personal negative  2712 non-null   float64
 9   compound           2712 non-null   float64
 10  High               2712 non-null   float64
 11  Low                2712 non-null   float64
 12  Open               2712 non-null   float64
 13  Close              2712 non-null   float64
 14  Volume             2712 non-null   float64
 15  Adj Close          2712 non-null   float64
 16  dayOfW

In [14]:
df.to_json('data/elonDFCleanV8.json')

In [None]:
#Do a train Test Split to avoid data leakage for our time series model 
X = df.loc[:, df.columns != 'Adj Close']
y = pd.DataFrame(df.loc[:, 'Adj Close'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.1, shuffle = False)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
#Change Financial NaNs using Linear
stockDatax = ['Close', 'Open', 'High', 'Low', 'Volume']
stockDatay = 'Adj Close' 

#Features
X_train[stockDatax] = X_train[stockDatax].interpolate(method = 'linear', 
                                                    limit_direction="both")
X_test[stockDatax] = X_test[stockDatax].interpolate(method = 'linear', 
                                                    limit_direction="both")
#Target
y_train[stockDatay] = y_train[stockDatay].interpolate(method = 'linear', 
                                                    limit_direction="both")
y_test[stockDatay] = y_test[stockDatay].interpolate(method = 'linear', 
                                                    limit_direction="both")

In [None]:
#Set Tweet data to 0s 
zeroCol = ['fav_count', 'retweet_count', 'Business positive',
           'Business neutral', 'Business negative', 'Personal positive',
           'Personal neutral', 'Personal negative',
           'compound', 'tweetLen']

#Features
X_train[zeroCol] = X_train[zeroCol].fillna(0)
X_test[zeroCol] = X_test[zeroCol].fillna(0)

In [None]:
#Check if no nulls
X_train.info(), y_train.info() 

In [None]:
#Check if no nulls
X_test.info(), y_test.info()

In [None]:
#Visualize just to see if interpolation didn't mess up 
plt.style.use('fivethirtyeight')

In [None]:
plt.plot(y_test['Adj Close'])
plt.xticks(rotation=45);

In [None]:
plt.plot(y_train['Adj Close'])
plt.xticks(rotation=45);

In [None]:
plt.plot(X_test['Volume'])
plt.xticks(rotation=45);

In [None]:
plt.plot(X_train['Volume'])
plt.xticks(rotation=45);

In [None]:
#Save Data For Model 
X_train.to_json('data/X_train.json')
X_test.to_json('data/X_test.json')
y_test.to_json('data/y_test.json')
y_train.to_json('data/y_train.json')

In [None]:
X_train.columns