In [305]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_palette('husl')
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit, KFold
from sklearn.utils import resample


from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [337]:
# Read the input CSV, discard every row that has a missing value, and
# renumber the index from zero so it stays contiguous after the drop.
data = (
    pd.read_csv('input_data_set_simple_price_range.csv')
    .dropna()
    .reset_index(drop=True)
)

## Preview of Data

In [338]:
# Peek at the first rows of the loaded frame as a sanity check.
data.head()

Unnamed: 0.1,Unnamed: 0,symbol,date,Sales Change,Net Profit Margin Change,EPS Change,prev_qtr_sales_change,prev_qtr_npm_change,prev_qtr_eps_change,Price Change
0,0,3IINFOTECH,2015-06-30,-12.235698,-9.945304,8.854167,-12.508025,71.061079,74.415205,0.0
1,1,3IINFOTECH,2015-09-30,-22.356635,-43.559135,-4.945055,2.824997,-7.205874,-9.142857,1.0
2,2,3IINFOTECH,2015-12-31,170.689981,93.524927,83.163265,240.61608,79.254084,30.890052,0.0
3,3,3IINFOTECH,2016-03-31,190.766103,30.327574,-235.672515,-5.11191,-982.498732,-1639.393939,1.0
4,4,3IINFOTECH,2016-06-30,212.009294,106.364604,117.714286,-6.115915,102.643582,101.350174,1.0


In [339]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10201 entries, 0 to 10200
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                10201 non-null  int64  
 1   symbol                    10201 non-null  object 
 2   date                      10201 non-null  object 
 3   Sales Change              10201 non-null  float64
 4   Net Profit Margin Change  10201 non-null  float64
 5   EPS Change                10201 non-null  float64
 6   prev_qtr_sales_change     10201 non-null  float64
 7   prev_qtr_npm_change       10201 non-null  float64
 8   prev_qtr_eps_change       10201 non-null  float64
 9   Price Change              10201 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 797.1+ KB


In [340]:
# Summary statistics for the numeric columns (useful for spotting outliers).
data.describe()

Unnamed: 0.1,Unnamed: 0,Sales Change,Net Profit Margin Change,EPS Change,prev_qtr_sales_change,prev_qtr_npm_change,prev_qtr_eps_change,Price Change
count,10201.0,10201.0,10201.0,10201.0,10201.0,10201.0,10201.0,10201.0
mean,5101.716302,94.394253,45.504628,82.020755,24.143334,45.555629,61.550902,0.438486
std,2945.341132,1441.607107,2488.635221,3300.687124,532.932691,3495.877719,3520.96321,0.496226
min,0.0,-99.699596,-30956.439833,-49600.0,-99.451604,-35300.18022,-106900.0,0.0
25%,2552.0,-0.968205,-27.051342,-29.032258,-5.72351,-25.031457,-28.708487,0.0
50%,5102.0,9.873,2.034683,11.049724,2.499349,0.256506,2.92887,0.0
75%,7652.0,23.141814,38.283802,55.021834,11.586774,31.311556,42.196532,1.0
max,10202.0,86179.525547,213504.672594,311400.0,40161.750205,335333.906692,263900.0,1.0


In [350]:
# Count rows per value of the target column to check class balance.
data['Price Change'].value_counts()

0.0    5728
1.0    4473
Name: Price Change, dtype: int64

## Modeling with Scikit-Learn

In [342]:
# List the column names before choosing features and target.
data.columns

Index(['Unnamed: 0', 'symbol', 'date', 'Sales Change',
       'Net Profit Margin Change', 'EPS Change', 'prev_qtr_sales_change',
       'prev_qtr_npm_change', 'prev_qtr_eps_change', 'Price Change'],
      dtype='object')

In [355]:
# Feature matrix: everything except the 'Unnamed: 0' column, the identifier
# columns (symbol, date) and the target. Target vector: 'Price Change'.
X = data.drop(columns=['Unnamed: 0', 'symbol', 'date', 'Price Change'])
y = data['Price Change']
print(X.shape, y.shape, sep='\n')

(10201, 6)
(10201,)


## Split the dataset into training and testing sets

In [344]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(7140, 6)
(7140,)
(3061, 6)
(3061,)


## Experimenting with the most common algorithms

In [345]:
# Baseline: a shallow decision tree evaluated on the 70/30 hold-out split.
# The split is recomputed here with the same seed as the split cell so this
# cell gives the same result even if X_train/X_test were overwritten elsewhere
# in the session.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

# Seed the tree: scikit-learn randomly permutes the features at each split, so
# an unseeded tree can differ between runs when candidate splits tie.
clf = DecisionTreeClassifier(max_depth=3, random_state=0)
clf.fit(X_train, y_train)

# Predict once and derive both metrics from the same predictions.
y_pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : " , score)
# zero_division=0 keeps precision defined (as 0) if no positives are predicted.
precision = metrics.precision_score(y_test, y_pred, average='binary', zero_division=0)
print("Precision : " , precision)

Accuracy :  0.569095066971578
Precision :  0.5217391304347826


In [358]:
# Compare a set of off-the-shelf classifiers on the hold-out split.
# Each display name is paired with its estimator in one place so the two can
# never drift out of sync; `names` and `classifiers` are derived from the pairs.
# Estimators with internal randomness are seeded so the printed accuracies are
# reproducible between runs.
# NOTE(review): the features are passed in unscaled; distance- and
# gradient-based models may behave differently after standardisation.
named_classifiers = [
    ("Logistic Regression", LogisticRegression()),
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", LinearSVC(random_state=0, tol=1e-5)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=0)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10,
                                             max_features=1, random_state=0)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=0)),
    ("AdaBoost", AdaBoostClassifier(random_state=0)),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
]

names = [label for label, _ in named_classifiers]
classifiers = [estimator for _, estimator in named_classifiers]

# Fit each classifier on the training rows and report test-set accuracy.
for name, clf in named_classifiers:
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)

    print(name , " : ", score)

Logistic Regression  :  0.5249918327344005
Nearest Neighbors  :  0.5223783077425678




Linear SVM  :  0.4828487422410977
RBF SVM  :  0.5589676576282261
Decision Tree  :  0.5445932701731461
Random Forest  :  0.5449199607971251
Neural Net  :  0.5289121202221496
AdaBoost  :  0.5472067951649787
Naive Bayes  :  0.5400196014374388
QDA  :  0.5488402482848742


## Stratified shuffle-split cross-validation

In [359]:
# Five stratified random 50/50 train/test splits, seeded so they are repeatable.
sss = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.5,
    random_state=0,
)
# Display how many splits the splitter will generate.
sss.get_n_splits(X, y)

5

In [363]:
# Evaluate a small random forest over the stratified shuffle splits from `sss`
# and report accuracy and precision averaged across the splits.
X_numpy = X.to_numpy()
y_numpy = y.to_numpy()

# Seeded so the reported averages are reproducible between runs.
clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=0)
scores = []
precisions = []
for train_index, test_index in sss.split(X_numpy, y_numpy):
    # Split-local names: reusing X_train/X_test/y_train/y_test here would
    # overwrite the 70/30 hold-out split that the other cells train on.
    X_fold_train, y_fold_train = X_numpy[train_index], y_numpy[train_index]
    X_fold_test, y_fold_test = X_numpy[test_index], y_numpy[test_index]

    clf.fit(X_fold_train, y_fold_train)
    # Predict once and compute both metrics from the same predictions.
    y_pred = clf.predict(X_fold_test)
    scores.append(metrics.accuracy_score(y_fold_test, y_pred))
    # Only the `metrics` module is imported, so precision_score must be reached
    # through it; a bare precision_score(...) raises NameError on a fresh kernel.
    precisions.append(
        metrics.precision_score(y_fold_test, y_pred, average='binary', zero_division=0)
    )
print("Average Score : " , np.mean(scores))
print("Average precision: ", np.mean(precisions))

Average Score :  0.5779258968829641
Average precision:  0.5321925678948123


## Results

Previous quarter change alone  
        accuracy  - 0.5548323858067046
        precision - 0.506757310984151

Over the year change alone
        accuracy  - 0.562007449519702
        precision - 0.5010936027408055

Both previous quarter and Over the year changes
        accuracy  - 0.5680454812781808
        precision - 0.5125012603295103