This notebook combines the control features with the BERT predictions and the consumer-image features. It then splits the data to train a neural network, a decision tree, and a random forest, and saves their performance metrics in tables. Finally, the baseline models are trained and tested.

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths read below live under this mount point.
drive.mount('/content/drive')

In [None]:
# Load the ';'-separated model-output table, using its first column as the index.
data = pd.read_csv('/content/drive/MyDrive/Thesis DSS/balanced_data/balanced_model_outputs_data.csv', sep = ";", index_col = 0)

In [None]:
# Apply softmax function to each row
def softmax(x):
    """Return the softmax of ``x`` computed along axis 0.

    For a 2-D input every column is normalised independently so that it sums
    to 1; a 1-D input is normalised as a single vector.

    Parameters
    ----------
    x : array-like
        Scores to normalise.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` holding the probabilities.
    """
    x = np.asarray(x, dtype=float)
    # Shift by the maximum *along axis 0* rather than the global maximum:
    # with a global shift, a column whose scores are all far below the global
    # maximum underflows to exp(...) == 0 everywhere and yields 0/0 = NaN.
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0, keepdims=True)

# Columns 2 and 3 of `data` are taken as the two per-class scores
# (presumably the BERT logits for class 0 and class 1 — TODO confirm).
# After transposing, each column holds one row's scores, which is the axis softmax sums over.
probabilities = softmax(data.iloc[:, 2:4].values.T)  # Transpose the DataFrame values before applying softmax

# The second row will contain the probabilities for class 1
class_1_probabilities = probabilities[1]

In [None]:
#getting control features
control_features = pd.read_csv('/content/drive/MyDrive/Thesis DSS/balanced_data/balanced_selection_variables.csv', sep = ";")

In [None]:
# The probabilities are a plain array, so they are attached by position:
# this assumes control_features has its rows in the same order as `data` — TODO confirm.
control_features['bert_probability'] = class_1_probabilities

In [None]:
# Remove the raw review text column from the feature table.
# NOTE(review): the baseline cells further down read X_train['review'], which is derived
# from this table — confirm whether this cell is skipped when running the baselines.
control_features.drop(columns = ['review'], inplace = True)

# Consumer pictures

In [None]:
cp_features = pd.read_csv('/content/drive/MyDrive/Thesis DSS/consumer_photo_features.csv', sep = ";")

In [None]:
amazon = pd.read_csv('/content/drive/MyDrive/Thesis DSS/balanced_data/balanced_dataset.csv', sep = ";")
# Keep the row index as an explicit 'id' column so exploded picture rows can be grouped back per original row.
amazon['id'] = amazon.index

In [None]:
amazon = amazon[['id', 'picture']]

#get list of picture and then explode
import ast
def return_list(string):
  """Turn one ``picture`` cell into a list of picture links.

  The sentinel ``'no'`` becomes ``['no']``; any other value is parsed as a
  Python literal with ``ast.literal_eval``.
  """
  return ['no'] if string == 'no' else ast.literal_eval(string)
# Parse each 'picture' cell into a list, then give every list element its own row.
amazon['picture_list'] = amazon['picture'].apply(return_list)
amazon_explode = amazon.explode('picture_list')
# explode repeats the original index for every element, so renumber the rows.
amazon_explode.reset_index(drop = True, inplace = True)

#get the names of the images, instead of link
def get_id(link):
  """Return the image name for a picture link.

  The name is the last ``/``-separated path segment, cut at its first ``.``.
  The sentinel ``'no'`` is passed through unchanged.
  """
  if link != 'no':
    filename = link.split('/')[-1]
    return filename.split('.')[0]
  return 'no'
# Derive the image name for every exploded picture link.
amazon_explode['Name'] = amazon_explode['picture_list'].map(get_id)

# Left-join the image features on the image name, discard the helper columns,
# and average the features of all pictures that share an id.
amazon_merged = amazon_explode.merge(cp_features, how='left', on='Name')
amazon_merged = amazon_merged.drop(columns=['picture', 'picture_list', 'Name'])
amazon_merged = amazon_merged.groupby('id').mean()

#impute black images
# Zero the Height/Width of the last cp_features row. The write goes through a
# single .iloc call on the frame itself: the chained form
# `cp_features['Height'].iloc[-1] = 0` may write to a temporary copy and is a
# silent no-op when pandas copy-on-write is active.
cp_features.iloc[-1, cp_features.columns.get_loc('Height')] = 0
cp_features.iloc[-1, cp_features.columns.get_loc('Width')] = 0
# Feature values of the placeholder image named 'other' (every column after the first).
black_image = cp_features[cp_features['Name'] == 'other'].iloc[:, 1:]
# Repeat the placeholder once per row of amazon_merged so it can serve as the fill value.
black_image_df = pd.DataFrame(np.repeat(black_image.values, amazon_merged.shape[0], axis=0), columns=amazon_merged.columns)
# Rows without matched image features (all-NaN after the left join) receive the placeholder values.
amazon_merged = amazon_merged.fillna(black_image_df)

amazon_merged.rename(columns=lambda x: x.replace('Column_', 'cp_'), inplace = True)
amazon_merged['id'] = amazon_merged.index

In [None]:
# Append the per-id image features column-wise; concat with axis=1 aligns the two frames on their index.
control_features = pd.concat([control_features, amazon_merged], axis = 1)

In [None]:
control_features.drop(['id'], inplace = True, axis = 1)

In [None]:
del amazon_merged
del black_image_df
del cp_features
del amazon

# Analysis

In [None]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
# Split the data into training and testing sets
# First hold out 40% of the rows, then halve that hold-out into validation and test
# (60/20/20 overall). 'helpful' is the target; every other column is a feature.
X_train, X_test, y_train, y_test = train_test_split(control_features.drop(columns=['helpful']), control_features['helpful'], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [None]:
#only drop if only looking at text
# NOTE(review): optional cell. Later cells filter on X_train['n_pictures'] and list
# 'n_pictures' among the columns to standardise, so they fail with a KeyError once this has run.
X_train.drop(['n_pictures'], axis = 1, inplace = True)
X_val.drop(['n_pictures'], axis = 1, inplace = True)
X_test.drop(['n_pictures'], axis = 1, inplace = True)

In [None]:
# Binarise the target: 1 when 'helpful' is non-zero, 0 otherwise.
dy_train = y_train.ne(0).astype(int)
dy_val = y_val.ne(0).astype(int)
dy_test = y_test.ne(0).astype(int)

In [None]:
# Keep only the rows that have at least one picture ...
train_has_pictures = X_train['n_pictures'].ne(0)
val_has_pictures = X_val['n_pictures'].ne(0)
test_has_pictures = X_test['n_pictures'].ne(0)
X_train = X_train.loc[train_has_pictures]
X_val = X_val.loc[val_has_pictures]
X_test = X_test.loc[test_has_pictures]
# ... and keep the targets of exactly those rows (lookup by index label).
dy_train = dy_train.loc[X_train.index]
dy_val = dy_val.loc[X_val.index]
dy_test = dy_test.loc[X_test.index]

In [None]:
#used to get equal sample sizes
def _balance_by_downsampling(features, target):
  """Return (features, target) with as many class-1 rows as class-0 rows.

  All class-0 rows are kept; class-1 rows are sampled without replacement
  (random_state 42) down to the class-0 count. The target is expected to be
  a Series named 'helpful'; it is appended as the last column and split off
  again by position. In the result the sampled class-1 rows come first.
  """
  combined = pd.concat([features, target], axis = 1)
  n_negative = combined.helpful.value_counts()[0]
  positives = combined[combined['helpful'] == 1].sample(n_negative, random_state = 42, replace = False)
  negatives = combined[combined['helpful'] == 0]
  balanced = pd.concat([positives, negatives])
  return balanced.iloc[:, :-1], balanced.iloc[:, -1]


X_train, dy_train = _balance_by_downsampling(X_train, dy_train)
dy_train = dy_train.astype(int)

X_val, dy_val = _balance_by_downsampling(X_val, dy_val)

X_test, dy_test = _balance_by_downsampling(X_test, dy_test)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardised; every other column is passed through untouched.
col_normalized = [
    'days_diff', 'rating', 'price_sd', 'price_diff',
    'rating_diff', 'rating_deviation_sd', 'reviews_sd', 'reviews_diff',
    'review_len', 'price_new_mean', 'rating_mean',
    'price_new_min', 'rating_min', 'price_new_max', 'rating_max',
    'price_new_median', 'rating_median',
    'Height', "Width", 'n_pictures',
]

scaler = StandardScaler()

# The scaler is fitted on the training split only; validation and test reuse its statistics.
# Each scaled frame keeps the row labels of the split it came from.
X_train_normalized = pd.DataFrame(scaler.fit_transform(X_train[col_normalized]), columns=col_normalized, index=X_train.index)
X_val_normalized = pd.DataFrame(scaler.transform(X_val[col_normalized]), columns=col_normalized, index=X_val.index)
X_test_normalized = pd.DataFrame(scaler.transform(X_test[col_normalized]), columns=col_normalized, index=X_test.index)

# Swap the raw columns for their standardised versions (appended on the right).
X_train = pd.concat([X_train.drop(columns=col_normalized), X_train_normalized], axis=1)
X_val = pd.concat([X_val.drop(columns=col_normalized), X_val_normalized], axis=1)
X_test = pd.concat([X_test.drop(columns=col_normalized), X_test_normalized], axis=1)

In [None]:
#reorder if pictures are included so all control variables are at the end
dummy_col = [
    'hedonic', 'experience', 'has_video',
    'Digital Photo Frames', 'Dvd Player', 'Electric Toothbrush', 'Lipstick', 'Mascara', 'Microwave Oven', 'Party Dress', 'Printer', 'Razor', 'Smartphone', 'Vacuum Cleaner',
    'bert_probability',
]

# Move the listed columns to the right-hand end of each split, in the order given above.
X_train = pd.concat([X_train.drop(columns=dummy_col), X_train[dummy_col]], axis=1)
X_val = pd.concat([X_val.drop(columns=dummy_col), X_val[dummy_col]], axis=1)
X_test = pd.concat([X_test.drop(columns=dummy_col), X_test[dummy_col]], axis=1)

## PCA

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Fit a full PCA on the first 1000 columns of the training features.
pca = PCA()
pca.fit(X_train.iloc[:, 0:1000])

# Cumulative share of variance explained as components are added.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(cumulative_variance, marker='o', linestyle='-')
ax.set_title('Explained Variance Ratio')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance Ratio')
ax.grid(True)
plt.show()


In [None]:
#PCA
from sklearn.decomposition import PCA

# The PCA input is the first 1000 columns of each split
# (presumably the consumer-photo features — TODO confirm).
pca_columns = slice(0, 1000)

# Reduce those columns to 20 components; the projection is learned from the training split only.
pca = PCA(n_components=20)
pca.fit(X_train.iloc[:, pca_columns])

# Project all three splits with the same fitted PCA.
X_train_pca = pd.DataFrame(pca.transform(X_train.iloc[:, pca_columns]))
X_val_pca = pd.DataFrame(pca.transform(X_val.iloc[:, pca_columns]))
X_test_pca = pd.DataFrame(pca.transform(X_test.iloc[:, pca_columns]))

In [None]:
# The PCA frames were built from bare arrays and so carry a fresh 0..n-1 index;
# restore the row labels of the split each one came from.
X_train_pca.index = X_train.index
X_val_pca.index = X_val.index
X_test_pca.index = X_test.index

In [None]:
# Put the columns from position 1000 onwards (those not fed to the PCA) to the right of the principal components.
X_train_pca = pd.concat([X_train_pca, X_train.iloc[:, 1000:]], axis = 1)
X_val_pca = pd.concat([X_val_pca, X_val.iloc[:, 1000:]], axis = 1)
X_test_pca = pd.concat([X_test_pca, X_test.iloc[:, 1000:]], axis = 1)

In [None]:
print(X_train_pca.shape)
print(X_val_pca.shape)
print(X_test_pca.shape)
print(len(dy_train))
print(len(dy_val))
print(len(dy_test))

## Image only

In [None]:
# Select (by index label) the targets of exactly the rows present in the PCA feature frames, in the same order.
dy_train = dy_train[X_train_pca.index]
dy_val = dy_val[X_val_pca.index]
dy_test = dy_test[X_test_pca.index]

## Model

## Neural Network

In [None]:
#complex model
from keras.optimizers import Adam
from keras.optimizers import AdamW
from keras.layers import BatchNormalization, Dropout, LeakyReLU
# Define the optimizer
optimizer = Adam(learning_rate=0.0005)

# Feature frames the network is trained and validated on.
X_train_input = X_train_pca
X_val_input = X_val_pca

# Two hidden ReLU layers (64 and 32 units) followed by a single sigmoid unit
# for binary classification.
model = Sequential()
model.add(Dense(64, input_dim=X_train_input.shape[1], activation = 'relu'))
# Only the first layer needs the input size; later layers infer it from the layer before them.
model.add(Dense(32, activation = 'relu'))
#model.add(Dense(1, activation='sigmoid', input_dim=X_train.iloc[:, :].shape[1]))  # Output layer with 1 neuron and sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_input, dy_train, epochs=10, batch_size=64, validation_data=(X_val_input, dy_val))

In [None]:
# Persist the per-epoch metrics recorded by model.fit (one row per epoch).
df_history = pd.DataFrame(history.history)
df_history.to_csv('/content/drive/MyDrive/Thesis DSS/balanced_data/Best models/imgonly_balanced/2layer(64_32)_pca20_imgonly.csv', sep = ";")

In [None]:
#get the performance metrics
category_performances = pd.DataFrame({'class' : [], 'accuracy_train' : [], 'accuracy_val' : [],
                            'precision' : [], 'recall' : [], 'fscore' : [], 'support' : []})

# Predict on the same validation features the network was validated on during
# training (X_val_input). The un-reduced X_val has a different set of columns
# and cannot be fed to this model.
predictions = model.predict(X_val_input)
# Threshold the sigmoid output at 0.5 to obtain class labels.
test_pred = (predictions > 0.5).astype(int)
precision, recall, fscore, support = precision_recall_fscore_support(dy_val, test_pred)
# Accuracies are taken from the last training epoch.
train_acc = df_history['accuracy'].iloc[-1]
val_acc = df_history['val_accuracy'].iloc[-1]

# One row per class plus a support-weighted 'mean' row; every row keeps index 0, as before.
metric_rows = [
    (0, precision[0], recall[0], fscore[0], support[0]),
    (1, precision[1], recall[1], fscore[1], support[1]),
    ('mean', np.average(precision, weights = support), np.average(recall, weights = support),
     np.average(fscore, weights = support), np.average(support, weights = support)),
]
for label, p, r, f, s in metric_rows:
    row = pd.DataFrame({'class' : [label], 'accuracy_train' : [train_acc], 'accuracy_val' : [val_acc],
                        'precision' : [p], 'recall' : [r], 'fscore' : [f], 'support' : [s]}, index = [0])
    category_performances = pd.concat([category_performances, row])


In [None]:
# NOTE(review): the file name says 2layer(128_32_16), while the network defined in this notebook
# has hidden layers of 64 and 32 units — confirm the name matches the model that was trained.
category_performances.to_csv('/content/drive/MyDrive/Thesis DSS/balanced_data/Best models/2layer(128_32_16)_pca20_imgonly_category_performances.csv', sep = ";")

In [None]:
# NOTE(review): the file name says 2layer(128_32_16), while the network defined in this notebook
# has hidden layers of 64 and 32 units — confirm the name matches the model being saved.
model.save("/content/drive/MyDrive/Thesis DSS/balanced_data/Best models/2layer(128_32_16)_pca20_imgonly.h5")

# Random Forest

In [None]:
!pip install \
    --extra-index-url=https://pypi.nvidia.com \
    cuml-cu12==24.4.*

In [None]:
from cuml.ensemble import RandomForestClassifier

## Decision Tree

In [None]:
#grid search decision tree
# The "decision tree" is a RandomForestClassifier limited to a single estimator with bootstrap disabled.
X_train_input = X_train_pca
X_val_input = X_val_pca
grid_search = pd.DataFrame({'class' : [], 'n' : [], 'max_depth' : [], 'features' : [], 'accuracy_train' : [], 'accuracy_val' : [],
                            'precision' : [], 'recall' : [], 'fscore' : [], 'support' : []})
# Positions of the feature columns to use: here, every column of the input frame.
cols = list(range(0, X_train_input.shape[1]))

for n_estimators in [1]:
  for max_depth in [3, 5, 10, 20]:
    for max_features in ['log2', 'sqrt', 0.3, 0.5]:
      train_features = X_train_input.iloc[:, cols]
      val_features = X_val_input.iloc[:, cols]

      rf_classifier = RandomForestClassifier(n_estimators=n_estimators, max_depth = max_depth, max_features = max_features, random_state=42, bootstrap = False)
      rf_classifier.fit(train_features, dy_train)

      # Training accuracy, reported next to the hyper-parameters being tried.
      train_pred = rf_classifier.predict(train_features)
      print(n_estimators, max_depth, max_features)
      accuracy_train = accuracy_score(dy_train, train_pred)
      print(accuracy_train)

      # Validation accuracy and per-class precision / recall / F-score / support.
      val_pred = rf_classifier.predict(val_features)
      accuracy = accuracy_score(dy_val, val_pred)
      precision, recall, fscore, support = precision_recall_fscore_support(dy_val, val_pred)

      # Three result rows per configuration: class 0, class 1 and the support-weighted mean.
      metric_rows = [
          (0, precision[0], recall[0], fscore[0], support[0]),
          (1, precision[1], recall[1], fscore[1], support[1]),
          ('mean', np.average(precision, weights = support), np.average(recall, weights = support),
           np.average(fscore, weights = support), np.average(support, weights = support)),
      ]
      for label, p, r, f, s in metric_rows:
        row = pd.DataFrame({'class' : [label], 'n' : [n_estimators], 'max_depth' : [max_depth], 'features' : [max_features], 'accuracy_train' : [accuracy_train], 'accuracy_val' : [accuracy],
                            'precision' : [p], 'recall' : [r], 'fscore' : [f], 'support' : [s]}, index = [0])
        grid_search = pd.concat([grid_search, row])

In [None]:
grid_search.to_csv('/content/drive/MyDrive/Thesis DSS/balanced_data/Best models/imgonly_balanced/dt_imgonly.csv', sep = ";", index = False)

## Random forest

In [None]:
#grid search random forest
X_train_input = X_train_pca
X_val_input = X_val_pca

grid_search = pd.DataFrame({'class' : [], 'n' : [], 'max_depth' : [], 'features' : [], 'min_samples' : [],'accuracy_train' : [], 'accuracy_val' : [],
                            'precision' : [], 'recall' : [], 'fscore' : [], 'support' : []})
# Column window of the input frame to train on: here, all columns.
col = 0
end_col = X_train_input.shape[1]
for n_estimators in [20, 100, 500, 2000]:
  #increased depth for cp only
  for max_depth in [5, 10, 20]:
    for max_features in ['log2', 'sqrt']:
      for min_samples in [25, 50, 100]:
        train_features = X_train_input.iloc[:, col:end_col]
        val_features = X_val_input.iloc[:, col:end_col]

        rf_classifier = RandomForestClassifier(n_estimators=n_estimators, max_depth = max_depth, max_features = max_features, min_samples_leaf = min_samples, random_state=42)
        rf_classifier.fit(train_features, dy_train)

        train_pred = rf_classifier.predict(train_features)
        print(n_estimators, max_depth, max_features, min_samples)
        accuracy_train = accuracy_score(dy_train, train_pred)

        val_pred = rf_classifier.predict(val_features)
        accuracy = accuracy_score(dy_val, val_pred)
        print("train: ", round(accuracy_train, 3), "val: ", round(accuracy, 3))
        precision, recall, fscore, support = precision_recall_fscore_support(dy_val, val_pred)

        # Every configuration is recorded (no over-fitting filter is applied):
        # class 0, class 1 and the support-weighted mean.
        metric_rows = [
            (0, precision[0], recall[0], fscore[0], support[0]),
            (1, precision[1], recall[1], fscore[1], support[1]),
            ('mean', np.average(precision, weights = support), np.average(recall, weights = support),
             np.average(fscore, weights = support), np.average(support, weights = support)),
        ]
        for label, p, r, f, s in metric_rows:
          row = pd.DataFrame({'class' : [label], 'n' : [n_estimators], 'max_depth' : [max_depth], 'features' : [max_features], 'min_samples' : [min_samples], 'accuracy_train' : [accuracy_train], 'accuracy_val' : [accuracy],
                              'precision' : [p], 'recall' : [r], 'fscore' : [f], 'support' : [s]}, index = [0])
          grid_search = pd.concat([grid_search, row])
grid_search

In [None]:
grid_search['accuracy_val'].max()

In [None]:
grid_search.to_csv('/content/drive/MyDrive/Thesis DSS/balanced_data/Best models/imgonly_balanced/rf_imgonly.csv', sep = ";", index = False)

# Compare all models

In [None]:
import os
import pandas as pd

# Define the directory containing the CSV files
directory = '/content/drive/MyDrive/Thesis DSS/balanced_data/history/'

# Best validation accuracy seen so far and the file it came from
max_accuracy = 0
max_accuracy_file = ''

# Loop through each CSV file in the directory
for filename in os.listdir(directory):
    if not filename.endswith('.csv'):
        continue
    filepath = os.path.join(directory, filename)

    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(filepath, sep = ";")

    try:
        if 'val_accuracy' in df.columns:
            # Training-history file: use the accuracy of the last epoch
            max_val_accuracy = df['val_accuracy'].iloc[-1]
        else:
            # Grid-search table: use the best validation accuracy in the table
            max_val_accuracy = df['accuracy_val'].max()
    except (KeyError, IndexError):
        # Neither column is usable (missing column or empty file)
        print('failed ', filename)
        continue
    print(filename, max_val_accuracy)

    # Keep track of the overall winner
    if max_val_accuracy > max_accuracy:
        max_accuracy = max_val_accuracy
        max_accuracy_file = filename

print('best: ', max_accuracy_file, max_accuracy)

#Best model

In [None]:
# Predict with the most recently fitted rf_classifier on one (hedonic, experience) slice of the test set.
# NOTE(review): `h` and `e` are not assigned anywhere above this cell; they only exist once one of the
# `for h ... for e ...` loops further down has run — confirm which slice is intended here.
X_test_input = pd.concat([X_test_pca, dy_test], axis = 1)
X_test_input = X_test_input[(X_test_input['hedonic'] == h) & (X_test_input['experience'] == e)]
# The last column is the appended target, so it is excluded from the model input.
rf_classifier.predict(X_test_input.iloc[:, :-1])

In [None]:
from keras.models import load_model

# Load the saved model
model = load_model("/content/drive/MyDrive/Thesis DSS/balanced_data/history/2layer(128_16)_pca20.h5")

In [None]:
#get the performance metrics for test set per class
X_train_input = X_train_pca
X_val_input = X_val_pca
X_test_input = X_test_pca



category_performances = pd.DataFrame({'class' : [], 'hedonic' : [], 'experience' : [], 'accuracy' : [], 'loss' : [],
                                      'tn' : [], 'fp' : [], 'fn' : [], 'tp' : [],
                            'precision' : [], 'recall' : [], 'fscore' : [], 'support' : []})

def get_performance_metrics(h, e, X_test_input, category_performances):
    """Evaluate the module-level ``model`` on one test slice and append its metrics.

    Parameters
    ----------
    h, e :
        Labels written to the 'hedonic' and 'experience' columns of the result
        rows. No filtering is done here; the caller passes the matching slice.
    X_test_input : pandas.DataFrame
        Frame with a 'helpful' target column. Every column except the last is
        fed to the model (callers append 'helpful' as the last column).
    category_performances : pandas.DataFrame
        Results table to extend.

    Returns
    -------
    pandas.DataFrame
        ``category_performances`` with three additional rows: class 0, class 1
        and the support-weighted 'mean'. All three carry the slice's accuracy,
        loss and confusion-matrix counts.
    """
    # Imported locally: at module level, confusion_matrix is only imported in a
    # later cell, so this function would otherwise fail on a top-to-bottom run.
    from sklearn.metrics import confusion_matrix

    features = X_test_input.iloc[:, :-1]
    target = X_test_input['helpful']

    test_loss, test_accuracy = model.evaluate(features, target)
    predictions = model.predict(features)
    # Threshold the model output at 0.5 to obtain class labels.
    test_pred = (predictions > 0.5).astype(int)
    # labels=[0, 1] guarantees two entries per metric and a 2x2 confusion matrix
    # even when a slice contains (or the model predicts) only one class.
    precision, recall, fscore, support = precision_recall_fscore_support(target, test_pred, labels=[0, 1], zero_division=0)
    print(test_accuracy)
    cm = confusion_matrix(target, test_pred, labels=[0, 1])

    metric_rows = [
        (0, precision[0], recall[0], fscore[0], support[0]),
        (1, precision[1], recall[1], fscore[1], support[1]),
        ('mean', np.average(precision, weights = support), np.average(recall, weights = support),
         np.average(fscore, weights = support), np.average(support, weights = support)),
    ]
    for label, p, r, f, s in metric_rows:
        row = pd.DataFrame({'class' : [label], 'hedonic' : [h], 'experience' : [e], 'accuracy' : [test_accuracy], 'loss' : [test_loss],
                            'tn' : cm[0,0], 'fp' : cm[0,1], 'fn' : cm[1,0], 'tp' : cm[1,1],
                            'precision' : [p], 'recall' : [r], 'fscore' : [f], 'support' : [s]}, index = [0])
        category_performances = pd.concat([category_performances, row])
    return category_performances

# Evaluate each of the four (hedonic, experience) combinations separately.
for h in [0,1]:
  for e in [0,1]:
    # Test features with the target appended as the last column.
    X_test_input = pd.concat([X_test_pca, dy_test], axis = 1)
    in_segment = (X_test_input['hedonic'] == h) & (X_test_input['experience'] == e)
    X_test_input = X_test_input[in_segment]
    category_performances = get_performance_metrics(h, e, X_test_input, category_performances)

In [None]:
#add also the full dataset
# The un-filtered test set is recorded with the label 'full' in both the hedonic and experience columns.
X_test_input = pd.concat([X_test_pca, dy_test], axis = 1)
category_performances = get_performance_metrics('full', 'full', X_test_input, category_performances)

In [None]:
category_performances.to_csv('/content/drive/MyDrive/Thesis DSS/balanced_data/history/category_performances.csv', sep = ";")

In [None]:
X_test_pca.groupby(['hedonic', 'experience']).size()

#Baseline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')

In [None]:
# Replace missing review texts with the empty string so the string operations below do not hit NaN.
# NOTE(review): an earlier cell drops 'review' from control_features, from which these splits are
# derived — confirm the column is retained (or re-attached) before running the baseline section.
X_train['review'] = X_train['review'].fillna('')
X_val['review'] = X_val['review'].fillna('')
X_test['review'] = X_test['review'].fillna('')

In [None]:
# Strip punctuation. regex=True is required: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
# The pattern is a raw string so that \w and \s are not read as string escapes.
X_train['review_clean'] = X_train['review'].str.replace(r'[^\w\s]', '', regex=True)
# Progress prints show the first row by position; label 0 is not guaranteed to be
# in the training split after shuffling and filtering.
print(X_train['review_clean'].iloc[0])
#lowercase everything
X_train['review_clean'] = X_train['review_clean'].str.lower()
print(X_train['review_clean'].iloc[0])
#remove stopwords
stop = stopwords.words('english')
# Set copy for constant-time membership tests; `stop` itself stays a list for later cells.
stop_set = set(stop)
X_train['review_clean'] = X_train['review_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))
print(X_train['review_clean'].iloc[0])
#stemming

porter = PorterStemmer()
X_train['review_clean'] = X_train['review_clean'].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))
print(X_train['review_clean'].iloc[0])

In [None]:
# Strip punctuation. regex=True is required: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
# The pattern is a raw string so that \w and \s are not read as string escapes.
X_val['review_clean'] = X_val['review'].str.replace(r'[^\w\s]', '', regex=True)
#lowercase everything
X_val['review_clean'] = X_val['review_clean'].str.lower()
#remove stopwords
# `stop` is the stop-word list built in the training-set cleaning cell; a set makes the lookups constant-time.
stop_words_val = set(stop)
X_val['review_clean'] = X_val['review_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words_val]))
#stemming
porter = PorterStemmer()
X_val['review_clean'] = X_val['review_clean'].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))

In [None]:
# Fit the vectorizer on the cleaned training reviews only, then reuse the fitted
# vectorizer to encode the validation reviews.
X = vectorizer.fit_transform(X_train['review_clean'])
X_v = vectorizer.transform(X_val['review_clean'])

In [None]:
# Create a KNN classifier
from cuml.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier to your data
knn.fit(X, dy_train)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
category_performances = pd.DataFrame({'class' : [], 'p1' : [], 'p2' : [], 'train_acc' : [], 'test_acc' : [],
                                      'tn' : [], 'fp' : [], 'fn' : [], 'tp' : [],
                            'precision' : [], 'recall' : [], 'fscore' : [], 'support' : []})


def get_performance_metrics(p1, p2, X, X_test_input, dy_train, dy_val, category_performances, model):
    """Score a fitted baseline classifier and append its metrics to the results table.

    Parameters
    ----------
    p1, p2 :
        Labels written to the 'p1' and 'p2' columns of the result rows
        (callers pass a model tag and the hyper-parameter value tried).
    X :
        Training features; used only to compute the training accuracy.
    X_test_input :
        Evaluation features.
    dy_train, dy_val :
        Targets matching ``X`` and ``X_test_input`` respectively.
    category_performances : pandas.DataFrame
        Results table to extend.
    model :
        Fitted estimator exposing ``predict``.

    Returns
    -------
    pandas.DataFrame
        ``category_performances`` with three additional rows: class 0, class 1
        and the support-weighted 'mean'.
    """
    y_train_pred = model.predict(X)
    train_acc = accuracy_score(dy_train, y_train_pred)

    test_pred = model.predict(X_test_input)

    test_acc = accuracy_score(dy_val, test_pred)
    print(test_acc)

    # labels=[0, 1] guarantees two entries per metric and a 2x2 confusion matrix
    # even when only one class is present or predicted.
    precision, recall, fscore, support = precision_recall_fscore_support(dy_val, test_pred, labels=[0, 1], zero_division=0)
    cm = confusion_matrix(dy_val, test_pred, labels=[0, 1])

    metric_rows = [
        (0, precision[0], recall[0], fscore[0], support[0]),
        (1, precision[1], recall[1], fscore[1], support[1]),
        ('mean', np.average(precision, weights = support), np.average(recall, weights = support),
         np.average(fscore, weights = support), np.average(support, weights = support)),
    ]
    for label, p, r, f, s in metric_rows:
        row = pd.DataFrame({'class' : [label], 'p1' : [p1], 'p2' : [p2], 'train_acc' : [train_acc], 'test_acc' : [test_acc],
                            'tn' : cm[0,0], 'fp' : cm[0,1], 'fn' : cm[1,0], 'tp' : cm[1,1],
                            'precision' : [p], 'recall' : [r], 'fscore' : [f], 'support' : [s]}, index = [0])
        category_performances = pd.concat([category_performances, row])
    return category_performances


In [None]:
# Try several neighbourhood sizes; each k adds three rows (class 0, class 1, mean) tagged 'knn' to the results.
for k in [5, 10, 50]:
  knn = KNeighborsClassifier(n_neighbors=k)

  # Fit the classifier to your data
  knn.fit(X, dy_train)
  category_performances = get_performance_metrics('knn', k, X, X_v, dy_train, dy_val, category_performances, knn)

In [None]:
category_performances

In [None]:
from cuml.naive_bayes import MultinomialNB

In [None]:
for k in [0.1, 1, 10]:
  # Initialize the Naive Bayes classifier with the smoothing parameter alpha set to the value being tried
  nb_classifier = MultinomialNB(alpha=k)

  # Fit the classifier to your data
  nb_classifier.fit(X, dy_train)
  # Results are tagged 'nb' together with the alpha value.
  category_performances = get_performance_metrics('nb', k, X, X_v, dy_train, dy_val, category_performances, nb_classifier)

In [None]:
from cuml.linear_model import LogisticRegression
# Try several values of C; results are tagged 'lr' together with the C value.
for c in [0.1, 0.5, 1]:
  reg = LogisticRegression(C = c)
  # Fit the classifier to your data
  reg.fit(X, dy_train)
  category_performances = get_performance_metrics('lr', c, X, X_v, dy_train, dy_val, category_performances, reg)

In [None]:
category_performances

In [None]:
# Refit logistic regression with `c` as left by the sweep over C values, and
# compute its accuracy on the training data.
reg = LogisticRegression(C = c)
# Fit the classifier to your data
reg.fit(X, dy_train)
# Predict with the regression that was just fitted (not the neural network held in
# `model`) and score against the binarised, filtered targets it was trained on.
y_train_pred = reg.predict(X)
train_acc = accuracy_score(dy_train, y_train_pred)

In [None]:
category_performances.to_csv('/content/drive/MyDrive/Thesis DSS/balanced_data/history/base_line_performance.csv', sep = ";")

# Test performance random forest

In [None]:
# Refit a random forest with one fixed configuration on the full training feature frame.
# NOTE(review): presumably the configuration selected from the grid search above — confirm.
rf_classifier = RandomForestClassifier(n_estimators=2000, max_depth = 10, max_features = 'log2', min_samples_leaf = 50, random_state=42)
rf_classifier.fit(X_train_pca, dy_train)

In [None]:
category_performances = pd.DataFrame({'class' : [], 'h' : [], 'e' : [], 'test_acc' : [],
                                      'tn' : [], 'fp' : [], 'fn' : [], 'tp' : [],
                            'precision' : [], 'recall' : [], 'fscore' : [], 'support' : []})


def get_performance_metrics(h, e, X_test_input, category_performances, model):
    """Score a fitted classifier on one test slice and append its metrics.

    Parameters
    ----------
    h, e :
        Labels written to the 'h' and 'e' columns of the result rows. No
        filtering is done here; the caller passes the matching slice.
    X_test_input : pandas.DataFrame
        Frame whose last column is the target and whose other columns are the
        model inputs.
    category_performances : pandas.DataFrame
        Results table to extend.
    model :
        Fitted estimator exposing ``predict``.

    Returns
    -------
    pandas.DataFrame
        ``category_performances`` with three additional rows: class 0, class 1
        and the support-weighted 'mean'.
    """
    test_pred = model.predict(X_test_input.iloc[:, :-1])
    y_true = X_test_input.iloc[:, -1]
    test_acc = accuracy_score(y_true, test_pred)
    print(test_acc)

    # labels=[0, 1] guarantees two entries per metric and a 2x2 confusion matrix
    # even when a slice contains (or the model predicts) only one class.
    precision, recall, fscore, support = precision_recall_fscore_support(y_true, test_pred, labels=[0, 1], zero_division=0)
    cm = confusion_matrix(y_true, test_pred, labels=[0, 1])

    metric_rows = [
        (0, precision[0], recall[0], fscore[0], support[0]),
        (1, precision[1], recall[1], fscore[1], support[1]),
        ('mean', np.average(precision, weights = support), np.average(recall, weights = support),
         np.average(fscore, weights = support), np.average(support, weights = support)),
    ]
    for label, p, r, f, s in metric_rows:
        row = pd.DataFrame({'class' : [label], 'h' : [h], 'e' : [e], 'test_acc' : [test_acc],
                            'tn' : cm[0,0], 'fp' : cm[0,1], 'fn' : cm[1,0], 'tp' : cm[1,1],
                            'precision' : [p], 'recall' : [r], 'fscore' : [f], 'support' : [s]}, index = [0])
        category_performances = pd.concat([category_performances, row])
    return category_performances

In [None]:
# Evaluate the random forest on each of the four (hedonic, experience) combinations.
for h in [0,1]:
  for e in [0,1]:
    # Test features with the target appended as the last column.
    X_test_input = pd.concat([X_test_pca, dy_test], axis = 1)
    in_segment = (X_test_input['hedonic'] == h) & (X_test_input['experience'] == e)
    X_test_input = X_test_input[in_segment]
    category_performances = get_performance_metrics(h, e, X_test_input, category_performances, rf_classifier)

In [None]:
# Also score the un-filtered test set; it is recorded with h = e = 2, a value the per-segment loop never uses.
category_performances = get_performance_metrics(2, 2, pd.concat([X_test_pca, dy_test], axis = 1), category_performances, rf_classifier)

In [None]:
category_performances

In [None]:
category_performances.to_csv('/content/drive/MyDrive/Thesis DSS/balanced_data/Best models/imgonly_balanced/img_only_performance_per_category.csv', sep = ";")