In [2]:
import numpy as np
import pandas as pd
import GaussianCopula as gc
import os
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import copy
import utils
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [3]:
year = 2014
filename = "cleaned_" + str(year) + "_Financial_Data.csv"
random_state = 24
df = pd.read_csv("Datasets/US_stock/"+filename)
df.rename(columns={'Unnamed: 0': 'Company'}, inplace = True)
shuffled_df = shuffle(df)

In [4]:
shuffled_df = shuffled_df.drop(['Company', str(year+1)+' PRICE VAR [%]', 'operatingCycle', 'cashConversionCycle'], axis = 1)
# drop the constant columns if exist 
for col in shuffled_df.columns:
    if len(shuffled_df[col].unique()) == 1:
        shuffled_df.drop(col,inplace=True,axis=1)

In [5]:
real_colnames = list(shuffled_df.columns.values)
colnames = [('X'+str(i)) for i in range(shuffled_df.shape[1]-1)]
colnames.append('Y')
colname_checker = {colnames[i]:real_colnames[i] for i in range(len(colnames))}
shuffled_df.columns = colnames

In [6]:
encoded_df = utils.encode_df(shuffled_df, [218])
encoded_df = encoded_df.astype({'X218': 'int32'})
cont_col = [i for i in range(218)]
unrankable = [218]

In [7]:
# replace the extreme values, too small? replace by 5% quantile value; too large? replace by 95% quantile value
inf = encoded_df[colnames[:-2]].quantile(0.05, axis = 0)
sup = encoded_df[colnames[:-2]].quantile(0.95, axis = 0)
for i in range(encoded_df.shape[1]-2):
    col = 'X'+str(i)
    encoded_df[col] = np.where((encoded_df[col] < inf[i]), inf[i], encoded_df[col])
    encoded_df[col] = np.where((encoded_df[col] > sup[i]), sup[i], encoded_df[col])

In [8]:
feature_val = {218: 11}
X = encoded_df[colnames[:-1]].to_numpy()
Y = encoded_df[colnames[-1]].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=random_state)

In [8]:
clf = gc.CopulaClassifier(cont_col, unrankable, feature_val, 0.9, 'gaussian',use_default_bandwidth=False,use_custom_bandwidth=True)

with open("2014_opt_bandwidth.pickle", "rb") as file:
    custom_bandwidth = pickle.load(file)
clf.fit(x_train,y_train,custom_bandwidth)

Got discrete posterior
Finished clustering
Optimized bandwidth


In [9]:
prediction = clf.predict(x_test)

  a = 1/np.sqrt(abs(np.linalg.det(R)))
  b = np.matmul(vec, mid_mat)
  c = np.matmul(b, vec.T)
  density = sum(marginal_density) + np.log(utils.copula_func(marginal_cdf, R))
  prob_distribution[c] += density
  return a * np.exp(-0.5 * c)


In [10]:
utils.get_accuracy(prediction, y_test)

0.5590551181102362

In [11]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
clf2 = GaussianNB()
clf2.fit(x_train,y_train)
nb_predict = clf2.predict(x_test)
utils.get_accuracy(nb_predict,y_test)

0.6194225721784777

In [15]:
np.linalg.cholesky(np.corrcoef(X[:,:5].T))

array([[ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [-0.12044926,  0.99271948,  0.        ,  0.        ,  0.        ],
       [ 0.88542941,  0.02185214,  0.46425989,  0.        ,  0.        ],
       [ 0.89894697,  0.0142796 , -0.23426577,  0.36987833,  0.        ],
       [ 0.21552667,  0.04548236,  0.05525435,  0.17067112,  0.95880026]])

In [17]:
cluster = utils.cluster_agnes(1-abs(np.corrcoef(X.T)), max_distance = 0.2)
cluster

[[4],
 [8],
 [11],
 [12],
 [14],
 [20],
 [25],
 [38],
 [40],
 [48],
 [49],
 [50],
 [53],
 [54],
 [56],
 [61],
 [64],
 [65],
 [66],
 [67],
 [68],
 [70],
 [71],
 [72],
 [74],
 [81],
 [82],
 [84],
 [85],
 [86],
 [102],
 [103],
 [106],
 [116],
 [122],
 [124],
 [127],
 [133],
 [134],
 [145],
 [146],
 [147],
 [148],
 [149],
 [150],
 [153],
 [156],
 [157],
 [159],
 [160],
 [161],
 [162],
 [163],
 [164],
 [165],
 [166],
 [169],
 [170],
 [172],
 [180],
 [191],
 [192],
 [193],
 [194],
 [195],
 [196],
 [197],
 [198],
 [199],
 [200],
 [201],
 [202],
 [203],
 [204],
 [205],
 [206],
 [207],
 [208],
 [209],
 [210],
 [211],
 [212],
 [213],
 [214],
 [215],
 [216],
 [217],
 [218],
 [88, 98],
 [94, 168],
 [101, 182],
 [100, 181],
 [110, 152],
 [115, 123],
 [111, 151],
 [108, 177],
 [107, 179],
 [109, 178],
 [95, 183],
 [120, 158],
 [76, 144],
 [75, 143],
 [16, 17],
 [80, 141],
 [114, 155],
 [79, 142],
 [118, 131],
 [78, 139],
 [29, 30, 23],
 [93, 89, 97],
 [140, 77],
 [19, 18],
 [112, 113],
 [125, 126],
