## Preliminary Analysis Using Google Trends 

In [None]:
# Load the raw data into df_all, drop rows with missing values, then make a
# reproducible 70/30 train/test split (random_state fixes the shuffle).

import pandas as pd
from sklearn.model_selection import train_test_split
df_all = pd.read_csv("crox-early-data.csv")

df_all = df_all.dropna()

df_train, df_test = train_test_split(df_all, test_size=0.3, random_state=1)
# Later cells add a "decision" column to both splits. Take explicit copies so
# those writes go to independent frames and cannot raise SettingWithCopyWarning.
df_train = df_train.copy()
df_test = df_test.copy()

# "Close Higher" is the label; every other column is kept as a candidate feature.
X_train = df_train.drop(columns=['Close Higher'])
y_train = df_train['Close Higher']

X_test = df_test.drop(columns=['Close Higher'])
y_test = df_test['Close Higher']

X_train.head()

In [None]:
# Mean of every numeric column, computed separately for each value of "Close Higher".
by_outcome = df_all.groupby("Close Higher")
by_outcome.mean(numeric_only=True)

In [None]:
# Choose two features and fit a logistic regression on the training split.
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()

cols = ["Previous GT", "TTM PE"]

LR.fit(X_train[cols], y_train)

# Only the last expression of a cell is displayed, so return the training
# accuracy and the fitted weights together instead of discarding the score.
# Note: LogisticRegression fits an intercept by default (LR.intercept_), which
# is not part of LR.coef_.
LR.score(X_train[cols], y_train), LR.coef_[0]

In [None]:
def linear_score(w, x0, x1):
    """
    Return the weighted sum w[0]*x0 + w[1]*x1.

    Only the first two entries of w are read. x0 and x1 may be anything that
    supports multiplication by a weight and addition (numbers, pandas columns).
    """
    first_term = w[0]*x0
    second_term = w[1]*x1
    return first_term + second_term

In [None]:
# predict() makes binary predictions for data using a supplied score function with weights w and a supplied threshold. Taken from the week 2 lecture notes.
# We begin with a threshold of 0; a later cell tests other thresholds to find a better one.

t = 0

def predict(score_fun, w, threshold, df, feature_cols=("Previous GT", "TTM PE")):
    """
    Make binary predictions for data df using a supplied score function with weights w and supplied threshold.

    Parameters
    ----------
    score_fun : callable
        Called as score_fun(w, x0, x1, ...), receiving one column of df per
        name in feature_cols, in that order.
    w : sequence
        Weights, passed through to score_fun unchanged.
    threshold : number
        A row is predicted 1 only when its score is strictly greater than this.
    df : pandas.DataFrame
        Must contain every column named in feature_cols.
    feature_cols : sequence of str, optional
        Columns fed to score_fun. Defaults to ("Previous GT", "TTM PE"), the
        pair this function originally hard-coded, so existing calls are unchanged.

    Returns
    -------
    1 where score > threshold and 0 elsewhere, in the same layout as the scores.
    """
    features = [df[col] for col in feature_cols]
    scores = score_fun(w, *features)
    return 1*(scores > threshold)

# Score the training rows themselves (not df_all) so the assignment does not
# depend on pandas index alignment to pick out the training subset.
df_train["decision"] = predict(linear_score, LR.coef_[0], t, df_train)
# Training accuracy at the current threshold t.
(df_train["decision"] == df_train["Close Higher"]).mean()

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


# Sweep `iterations` candidate thresholds, spaced 1 apart and starting at
# -iterations/2, recording the training accuracy obtained with each one.
iterations = 200
predictions = []
for i in range(iterations):
    threshold = (-iterations/2)+(i)
    # Keep each candidate's predictions local so df_train["decision"] is not
    # left holding the output of whichever threshold happened to be tried last.
    candidate = predict(linear_score, LR.coef_[0], threshold, df_train)
    predictions.append((threshold, (candidate == df_train["Close Higher"]).mean()))


predictions_df = pd.DataFrame(data=predictions, columns=['Threshold', 'Accuracy'])

sns.relplot(data=predictions_df, x="Threshold", y="Accuracy")

# idxmax returns the first row with the highest accuracy; look it up once and
# read the threshold with .loc instead of chained indexing.
best_idx = predictions_df['Accuracy'].idxmax()
t = predictions_df.loc[best_idx, 'Threshold']

# Store the training predictions that correspond to the chosen threshold.
df_train["decision"] = predict(linear_score, LR.coef_[0], t, df_train)

t, predictions_df['Accuracy'].max()


In [None]:
# Apply the fitted weights and the current threshold t to the held-out rows,
# then report the fraction of test rows whose prediction matches the label.
test_decisions = predict(linear_score, LR.coef_[0], t, df_test)
df_test["decision"] = test_decisions
df_test["decision"].eq(df_test["Close Higher"]).mean()