<a href="https://www.kaggle.com/code/akashgpt04011995/complete-analysis-ipynb?scriptVersionId=175020267" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in os.walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from collections import Counter
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# col_names = pd.read_csv('/kaggle/input/data-science-challenge-predicting-stock-trends/column_names_dictionary.csv', sep='delimiter', header=None)
# print(col_names.shape)
# col_names = col_names[0].str.split(';', n=2, expand=True)
# col_names, col_names.columns = col_names[1:] , col_names.iloc[0]
# col_names.head(2)

### Load Train Data

In [None]:
# Load the raw training file.
# The separator 'delimiter' is a placeholder string rather than the real field
# separator, so each raw line is read into a single column (column 0); the
# ';'-separated fields are split out of that column in a later cell.
TRAIN_CSV_PATH = '/kaggle/input/data-science-challenge-predicting-stock-trends/training_data.csv'
train_data = pd.read_csv(TRAIN_CSV_PATH, sep='delimiter', header=None)
print(train_data.shape)
train_data.head(2)

### Data Preparation

In [None]:
# Turn the single raw-text column into a table: split every line on ';'
# (at most 160 splits), use the first row as the header and keep the rest
# as data rows.
raw_train_rows = train_data[0].str.split(';', n=160, expand=True)
train_header = raw_train_rows.iloc[0]
train_data = raw_train_rows[1:]
train_data.columns = train_header
train_data.head(2)

In [None]:
# Swap ',' for '.' in every cell so the text can later be parsed as numbers.
# The frame is rebuilt column by column from the converted string Series.
converted_train_columns = {}
for col in train_data.columns:
    converted_train_columns[col] = train_data[col].str.replace(',', '.')
train_data = pd.DataFrame(converted_train_columns)
train_data.head(2)

### Convert Categorical Feature to Numerical

In [None]:
# Show how many rows fall into each Group category before encoding.
display(train_data['Group'].value_counts())
# One-hot encode Group into 'G_*' indicator columns (the first category is
# dropped), then map the boolean indicators to 0/1.
train_data = (
    pd.get_dummies(train_data, columns=['Group'], drop_first=True, prefix='G')
    .replace({False: 0, True: 1})
)
# List the resulting column names.
print(train_data.columns)
train_data.head(2)

In [None]:
# Remove the 'Perform' column; it is not used as a feature or target below.
train_data = train_data.drop(columns=['Perform'])
train_data.shape

In [None]:
# Parse every column as numeric; values that cannot be parsed become NaN.
train_data = train_data.apply(pd.to_numeric, errors='coerce')

In [None]:
# Display the dtype of every column of the training frame.
train_data.dtypes

### Descriptive Analytics

In [None]:
# Histogram of every column except the last ten, restricted to the values
# inside the Tukey fences [Q1 - k*IQR, Q3 + k*IQR].
# The same multiplier is now used on both sides (the upper fence previously
# used 1.7 while the lower one used 1.5), and the deprecated sns.distplot is
# replaced by sns.histplot, which draws the same count histogram.
# Alternative view: sns.violinplot(data=train_data, y='I3')
IQR_MULTIPLIER = 1.5
for col in train_data.columns[:-10]:
    plt.figure(figsize=(7, 2))
    q1 = train_data[col].quantile(0.25)
    q3 = train_data[col].quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - IQR_MULTIPLIER * iqr
    fence_high = q3 + IQR_MULTIPLIER * iqr
    # Rows whose value lies within the fences (bounds inclusive; NaN excluded).
    # The name `df` is kept because it remains a module-level variable after
    # the loop.
    df = train_data.loc[train_data[col].between(fence_low, fence_high)].copy()

    ax = sns.histplot(df[col], bins=100)
    # Mark both fences on the plot.
    ax.axvline(x=fence_low, linewidth=1, color='orange', ls='--')
    ax.axvline(x=fence_high, linewidth=1, color='orange', ls='--')
    print(df.shape)
    plt.show()

### Correlation

In [None]:
# Correlation of every column with the target, sorted from most positive to
# most negative. reset_index() exposes the column names in an 'index' column.
# The target is taken from train_data itself: the previous code used
# df["Class"], where `df` is the outlier-filtered frame left over from the
# last iteration of the plotting loop, so rows outside that column's fences
# were silently excluded from every correlation.
correlation_matrix = (
    train_data.corrwith(train_data["Class"])
    .sort_values(ascending=False)
    .reset_index()
)  # feature selection below keeps rows up to 55 and the last 45
correlation_matrix.tail(2)

In [None]:
# Feature names taken from both ends of the correlation ranking:
# rows 1..54 from the top (row 0 is skipped; presumably it is Class itself)
# and the last 45 rows from the bottom.
# NOTE(review): sort_values puts NaN correlations last, so the bottom slice
# may include columns whose correlation is undefined. Confirm.
top_ranked_features = correlation_matrix.iloc[1:55]['index'].tolist()
bottom_ranked_features = correlation_matrix.iloc[-45:]['index'].tolist()
corr_features = top_ranked_features + bottom_ranked_features

### Handle Outliers

### Handle Missing Value

In [None]:
# Blank out cells equal to the strings 'NA' or '' (they become NaN), then
# show the NaN count of every column.
# NOTE(review): the columns were already coerced to numeric earlier, so this
# mask is expected to keep every cell. Confirm whether it is still needed.
is_real_value = (train_data != 'NA') & (train_data != '')
train_data = train_data.where(is_real_value)
train_data.isna().sum().values

In [None]:
# Percentage of missing rows per column, computed for all columns at once.
missing_pct = train_data.isna().sum() / train_data.shape[0] * 100

# Columns with at least 1% missing values.
col_with_missing_vals = [
    name for name, pct in missing_pct.items() if pct >= 1.00
]
# Columns with some missing values, but fewer than 1%.
col_with_minor_missing_vals = [
    name for name, pct in missing_pct.items() if 0.00 < pct < 1.00
]
print(f'{len(col_with_missing_vals)} out of {len(train_data.columns)} have missing values')
print(f'{len(col_with_minor_missing_vals)} out of {len(train_data.columns)} have missing values < 1%')

In [None]:
# 1st iteration: for the columns with < 1% missing values, drop every row
# that has a NaN in any of them. The shape is printed before and after.
print(train_data.shape)
train_data = train_data.dropna(axis=0, subset=col_with_minor_missing_vals)
print(train_data.shape)

In [None]:
# Re-count the missing values after the row drop and report them per column.
missing_counts = train_data.isna().sum()
missing_share = missing_counts / train_data.shape[0] * 100

col_with_missing_vals = []
col_with_minor_missing_vals = []
for position, (name, n_missing) in enumerate(missing_counts.items()):
    share = missing_share.iloc[position]
    print('> %d, %s Missing: %d (%.2f%%)' % (position, name, n_missing, share))
    # Any column that still has at least one NaN is recorded.
    if share > 0.00:
        col_with_missing_vals.append(name)
print(f'{len(col_with_missing_vals)} out of {len(train_data.columns)} have missing values')

In [None]:
# # Technique 1: Fill Missing Vals Using KNN
# # split into input and output elements
# data = train_data.values.copy()
# ix = [i for i in range(data.shape[1]) if i != 116]
# X, y = data[:, ix], data[:,116]
# # print total missing
# print('Missing: %d' % sum(pd.isna(X).flatten()))
# # define imputer
# # imputer = KNNImputer(n_neighbors=3)
# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# # fit on the dataset
# imputer.fit(X)
# # transform the dataset
# Xtrans = imputer.transform(X)
# # print total missing
# print('Missing: %d' % sum(pd.isna(Xtrans).flatten()))

### Feature Engineering

In [None]:
# Engineered feature name -> the indicator columns it averages.
# The insertion order is the order in which the new columns are appended.
_CALCULATED_FEATURE_SOURCES = {
    'profit_ratio': ('I1', 'I2', 'I3', 'I4'),
    'liquidity': ('I50', 'I51', 'I53'),
    'leverage': ('I54', 'I55', 'I56'),
    'oper_eff': ('I22', 'I23', 'I24', 'I25', 'I26'),
    'valuation': ('I39', 'I40', 'I41', 'I42', 'I43'),
}


def add_calculated_columns(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with five averaged indicator columns appended.

    Each new column is the plain arithmetic mean of a fixed group of ``I*``
    columns (see ``_CALCULATED_FEATURE_SOURCES``). A row gets NaN in a new
    column whenever any of its source values is NaN, because the sources are
    added with ``+`` before dividing.

    The input frame is left untouched: the new columns are written to a copy,
    so callers must use the returned frame.

    Args:
        X: Frame containing every source column listed in
            ``_CALCULATED_FEATURE_SOURCES``.

    Returns:
        A new DataFrame with the original columns followed by
        ``profit_ratio``, ``liquidity``, ``leverage``, ``oper_eff`` and
        ``valuation``.

    Raises:
        KeyError: If a required source column is missing from ``X``.
    """
    result = X.copy()
    for feature, sources in _CALCULATED_FEATURE_SOURCES.items():
        # Add the sources left to right, exactly like a chained `a + b + c`.
        total = X[sources[0]]
        for source in sources[1:]:
            total = total + X[source]
        result[feature] = np.divide(total, len(sources))
    return result


# Names of the engineered columns, derived from the mapping so the two
# cannot drift apart.
calculated_columns = list(_CALCULATED_FEATURE_SOURCES)

In [None]:
# Append the engineered columns to the training frame and print the shape
# before and after to confirm the number of columns added.
print(train_data.shape)
train_data = add_calculated_columns(X=train_data)
print(train_data.shape)
train_data.head(2)

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

In [None]:
# # define modeling pipeline
# # model = RandomForestClassifier(n_estimators=150, max_depth=26, random_state=1111)
# model = SVC(kernel='rbf')
# # model = GaussianNB()
# # model = AdaBoostClassifier()
# # model = DecisionTreeClassifier()
# # model = LogisticRegression()
# # model = lgb.LGBMClassifier() # .477

# # imputer = KNNImputer()
# pipeline = Pipeline(steps=[('i', imputer), ('m', model)])
# # define model evaluation
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# # evaluate model
# scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores))) # .474 # RF & ADB .483 # SVC - .487 # NB - .33 # DT - .41

In [None]:
# # evaluate each strategy on the dataset
# results = list()
# strategies = [str(i) for i in [1,3,7,11,15,19]]
# for s in strategies:
#     # create the modeling pipeline
#     pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=int(s))), ('m', SVC(kernel='rbf'))])
#     # evaluate the model
#     cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#     scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
#     # store results
#     results.append(scores)
#     print('>%s %.3f (%.3f)' % (s, np.mean(scores), np.std(scores)))
# # plot model performance for comparison
# plt.boxplot(results, labels=strategies, showmeans=True)
# plt.show()

In [None]:
# # changing data types to float
# train_data = pd.concat([train_data.iloc[:,0], train_data[train_data.columns[1:]].astype(float)], axis=1)
# train_data.head(2)

### Selecting Corr Features Only

In [None]:
# Split the frame into input features and target.
# Unused alternatives kept for reference:
# train_data['Class'] = train_data['Class'].apply(lambda x:2.0 if x==-1.0 else x)
# X = train_data.drop(['Class'], axis=1)
display(train_data['Class'].value_counts())
# Inputs: the correlation-selected features followed by the engineered ones.
selected_feature_names = corr_features + calculated_columns
X = train_data.loc[:, selected_feature_names]
# Target: the Class label cast to integer.
y = train_data['Class'].astype('int')
print(X.shape, y.shape)

### Train-Test Split

In [None]:
# Hold out a validation set: 80% of the rows go to training, stratified on
# the target, with a fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=1111,
    stratify=y,
)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

In [None]:
# Mean imputation: the column means are learned from the training split only
# and then applied to both the training and the validation split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_val = imputer.transform(X_val)

### Handle Imbalanced Data

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training split with SMOTE (fixed seed); the class counts are
# shown before and after resampling. The validation split is left untouched.
print(Counter(y_train))
sm = SMOTE(random_state=21)
X_train, y_train = sm.fit_resample(X_train, y_train)
Counter(y_train)

### Modelling

In [None]:
# Model fitting
# Classifier in use: support vector machine with an RBF kernel.
# Alternatives that were tried:
# model = RandomForestClassifier(n_estimators=150, max_depth=26, random_state=1111)
# model = lgb.LGBMClassifier()
# NOTE(review): no feature scaling is applied before the SVC in this
# notebook; RBF kernels are usually sensitive to feature scale. Confirm.
model = SVC(kernel='rbf')
# Train on the (resampled) training split.
model.fit(X_train, y_train)
# Predict the validation split and preview the first five predictions.
preds = model.predict(X_val)
print(preds[:5])
# Accuracy on the validation split.
print(model.score(X_val, y_val))

### Prediction

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion matrix on the validation split.
# scikit-learn expects (y_true, y_pred); the arguments were previously passed
# as (preds, y_val), which produced the transposed matrix (rows = predicted).
# With the corrected order, rows are the true classes and columns the
# predicted classes, both ordered as in model.classes_.
preds = model.predict(X_val)
cm = confusion_matrix(y_val, preds, labels=model.classes_)
# ConfusionMatrixDisplay(cm, display_labels=model.classes_).plot()
cm

In [None]:
# 3x3 misclassification cost table: entry [i][j] is |i - j|, i.e. zero on the
# diagonal, 1 for adjacent positions and 2 for the two opposite corners.
# Evaluates to [[0, 1, 2], [1, 0, 1], [2, 1, 0]].
cost_matrix = [[abs(row - col) for col in range(3)] for row in range(3)]
cost_matrix

#### Cost Calculation

In [None]:
# Average misclassification cost: weight every confusion-matrix cell by its
# cost, divide by the number of validation samples, and sum all cells.
per_cell_cost = np.multiply(cm, cost_matrix) / len(y_val)
err = np.sum(per_cell_cost)
print(err)

In [None]:
# ### defining the model 
# import xgboost as xgb
# from xgboost import XGBClassifier
# model=xgb.XGBClassifier(n_estimators=1000, learning_rate=0.05, tree_method = 'gpu_hist',min_child_weight=5, reg_lambda=20, gamma=2 ,random_state=69,
#                        reg_alpha=26,subsample=0.9,colsample_bytree=0.12,max_depth=30)
# # Fit rfc using X_train and y_train
# model.fit(X_train, y_train)
# # Create predictions on X_test
# preds = model.predict(X_val)
# print(preds[0:5])
# # Print model accuracy using score() and the testing data
# print(model.score(X_val, y_val))

### Prediction on Test Data

In [None]:
# Load the raw test file (no target column, per its file name).
# As with the training file, the placeholder separator 'delimiter' makes each
# raw line land in a single column (column 0), which is split in a later cell.
TEST_CSV_PATH = '/kaggle/input/data-science-challenge-predicting-stock-trends/test_data_no_target.csv'
test_data = pd.read_csv(TEST_CSV_PATH, sep='delimiter', header=None)
print(test_data.shape)
test_data.head(2)

In [None]:
# Turn the single raw-text column into a table: split every line on ';'
# (at most 160 splits), use the first row as the header and keep the rest
# as data rows.
raw_test_rows = test_data[0].str.split(';', n=160, expand=True)
test_header = raw_test_rows.iloc[0]
test_data = raw_test_rows[1:]
test_data.columns = test_header
test_data.head(2)

In [None]:
# Swap ',' for '.' in every cell so the text can later be parsed as numbers.
# The frame is rebuilt column by column from the converted string Series.
converted_test_columns = {}
for col in test_data.columns:
    converted_test_columns[col] = test_data[col].str.replace(',', '.')
test_data = pd.DataFrame(converted_test_columns)
test_data.head(2)

In [None]:
# One-hot encode Group into 'G_*' indicator columns (the first category is
# dropped) and map the boolean indicators to 0/1, mirroring the training
# preprocessing.
test_data = (
    pd.get_dummies(test_data, columns=['Group'], drop_first=True, prefix='G')
    .replace({False: 0, True: 1})
)

In [None]:
# Parse every column as numeric; values that cannot be parsed become NaN.
test_data = test_data.apply(pd.to_numeric, errors='coerce')
print(test_data.dtypes)

In [None]:
# Blank out cells equal to the strings 'NA' or '' (they become NaN), then
# show the NaN count of every column.
# NOTE(review): the columns were already coerced to numeric earlier, so this
# mask is expected to keep every cell. Confirm whether it is still needed.
is_real_test_value = (test_data != 'NA') & (test_data != '')
test_data = test_data.where(is_real_test_value)
test_data.isna().sum().values

In [None]:
# Append the same engineered columns to the test frame as were added to the
# training frame, then show the resulting shape and a preview.
test_data = add_calculated_columns(X=test_data)
print(test_data.shape)
test_data.head(2)

In [None]:
# Select the same feature columns, in the same order, that were used to
# build the training inputs.
test_feature_names = corr_features + calculated_columns
X_test = test_data.loc[:, test_feature_names]

In [None]:
# Fill missing test values with the already-fitted imputer
# (transform only; the imputer is not refit on the test data).
X_test = imputer.transform(X_test)

In [None]:
# Predict the class of every test row, then preview the first five
# predictions and the shape of the prediction array (one per line).
test_preds = model.predict(X_test)
print(test_preds[:5], test_preds.shape, sep='\n')

In [None]:
# Write the predictions to 'test_preds.csv' as a single column without the
# row index.
predictions_frame = pd.DataFrame(test_preds)
predictions_frame.to_csv('test_preds.csv', index=False)