## Step 1: Imports


In [98]:
import pandas as pd
import kagglehub
import math 
import numpy as np 

from sklearn import preprocessing, svm, model_selection
from sklearn.linear_model import LinearRegression

In [99]:
df = pd.read_csv('WIKI_PRICES.csv')

## Step 2: Understand The Data

In [100]:
print(df.shape[0])
print(df.shape[1])

15389314
14


In [101]:
df.isnull().sum()

ticker           0
date             0
open           538
high            55
low             55
close            1
volume           0
ex-dividend      0
split_ratio      1
adj_open       538
adj_high        55
adj_low         55
adj_close        1
adj_volume       0
dtype: int64

In [102]:
df.dtypes

ticker          object
date            object
open           float64
high           float64
low            float64
close          float64
volume         float64
ex-dividend    float64
split_ratio    float64
adj_open       float64
adj_high       float64
adj_low        float64
adj_close      float64
adj_volume     float64
dtype: object

In [103]:
df.describe()

Unnamed: 0,open,high,low,close,volume,ex-dividend,split_ratio,adj_open,adj_high,adj_low,adj_close,adj_volume
count,15388780.0,15389260.0,15389260.0,15389310.0,15389310.0,15389310.0,15389310.0,15388780.0,15389260.0,15389260.0,15389310.0,15389310.0
mean,76.52829,77.24188,75.77329,76.53463,1192647.0,0.002202309,1.000185,167.9133,170.6558,165.0796,167.8707,1412932.0
std,2511.524,2526.096,2494.656,2511.146,8406732.0,0.2145488,0.02078999,5231.34,5331.112,5130.088,5230.613,6535418.0
min,0.001,0.002,0.001,0.002,0.0,0.0,0.00175,0.0071091,0.007870789,0.0071091,0.007870789,0.0
25%,11.97,12.17,11.75,11.97,38500.0,0.0,1.0,6.130212,6.25,6.010335,6.130321,46800.0
50%,23.25,23.62,22.9,23.25,186646.0,0.0,1.0,13.87851,14.10171,13.63224,13.87932,232100.0
75%,39.45,39.95,38.92,39.46,717400.0,0.0,1.0,28.09588,28.51,27.66,28.0965,907200.0
max,325650.0,326350.0,323100.0,325915.0,6674913000.0,567.9717,50.0,870240.0,923280.0,854520.0,878520.0,2304019000.0


In [104]:
for column in df.columns:
    print(f"{column} : {len(df[column].unique())}")

ticker : 3199
date : 14277
open : 124876
high : 267980
low : 274874
close : 123164
volume : 1678767
ex-dividend : 2163
split_ratio : 125
adj_open : 7137032
adj_high : 7209700
adj_low : 7215091
adj_close : 7039021
adj_volume : 1978562


## Step 3: Clean And Prepare The Data

In [105]:
dataframe = df[['adj_open', 'adj_high', 'adj_low', 'adj_close', 'adj_volume']]

In [106]:
dataframe = dataframe.rename(columns={'adj_open':'open', 'adj_high':'high', 'adj_low':'low', 'adj_close':'close', 'adj_volume':'volume'})

In [107]:
dataframe['volatilityPercentage'] = (dataframe['open'] - dataframe['close']) / dataframe['close'] * 100.0
dataframe['percentageChange'] = (dataframe['close'] - dataframe['open']) / dataframe['open'] * 100.0

In [108]:
df = dataframe[['close', 'volatilityPercentage', 'percentageChange', 'volume']]

In [109]:
df.fillna(-99999, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(-99999, inplace= True)


In [110]:
df.dropna(inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace= True)


In [111]:
df.head()

Unnamed: 0,close,volatilityPercentage,percentageChange,volume
0,30.01859,3.409091,-3.296703,44739900.0
1,27.548879,6.339772,-5.961807,10897100.0
2,30.01859,-6.113636,6.51174,4705200.0
3,27.460188,5.590062,-5.294118,4274400.0
4,28.012803,-2.264978,2.317468,3464400.0


## Machine Learning

In [112]:
forecastCol = 'close'
forecastOut = int(math.ceil(0.000001 * len(df)))

In [113]:
df['label'] = df[forecastCol].shift(-forecastOut)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df[forecastCol].shift(-forecastOut)


In [114]:
df.dropna(inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace= True)


In [115]:
x = np.array(df.drop(['label'], axis=1))
y = np.array(df['label'])

In [116]:
x = preprocessing.scale(x)
y = np.array(df['label'])

In [117]:
xTrain, xTest, yTrain, yTest = model_selection.train_test_split(x, y, test_size=0.2)

In [None]:
clf = svm.SVR()

clf.fit(xTrain, yTrain)

accuracy = clf.score(xTest, yTest)

print(accuracy)

In [None]:
clf = LinearRegression()

clf.fit(xTrain, yTrain)

accuracy = clf.score(xTest, yTest)

print(accuracy)


0.9743653570805323 16
