In [1]:
!pip install scikit-learn matplotlib numpy pandas xgboost scikit-surprise recommenders fastFM==0.2.9 --quiet

from google.colab import drive
import numpy as np
import matplotlib.pyplot as plt
import surprise
import pandas as pd
from fastFM import als, mcmc, sgd
from collections import defaultdict
from surprise import Reader, Dataset, SVD, BaselineOnly, KNNBaseline
from surprise.accuracy import mse
from recommenders.datasets.python_splitters import python_stratified_split
from surprise.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, roc_auc_score, precision_score, classification_report
from sklearn.model_selection import GridSearchCV as GridSearchCVSK
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

plt.rcParams['figure.figsize'] = (12, 12)
drive.mount("/content/drive")
%cd "/content/drive/MyDrive/CSE258Assignment2"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/CSE258Assignment2


# Data

In [2]:
trainData = pd.read_csv('drugsComTrain_raw.tsv', sep='\t', header=0)
testData = pd.read_csv('drugsComTest_raw.tsv', sep='\t', header=0)

In [3]:
conditionInds = np.where(trainData.condition.value_counts()> 50)[0]
conditions = list(trainData.condition.value_counts()[conditionInds].index)
conditions = [cond for cond in conditions if cond[-4:] != 'ful.']

drugInds = np.where(trainData.drugName.value_counts()> 50)[0]
drugs = list(trainData.drugName.value_counts()[drugInds].index)

def extract_year(date):
    temp = date.split()
    return temp[-1]

def extract_day(date):
    temp = date.split()
    return temp[1][:-1]

def extract_month(date):
    temp = date.split()
    return temp[0]

def preprocess_data(df):
    df.drop(df.columns[0], axis = 1, inplace=True)

    # Process review to length of review and drop all text
    df['reviewLen'] = df.apply(lambda row: len(row.review), axis = 1)
    df.drop('review', axis = 1, inplace=True)

    # Process date by splitting it to year, month, day
    df['year'] = df.apply(lambda row: extract_year(row.date), axis = 1)
    df['day'] = df.apply(lambda row: extract_day(row.date), axis = 1)
    df['month'] = df.apply(lambda row: extract_month(row.date), axis = 1)
    df.drop('date', axis = 1, inplace=True)

    # Process condition to get rid of unwanted values in conditions and drugs
    df = df[(df['condition'].isin(conditions)) & (df['drugName'].isin(drugs))]

    return df.drop('rating', axis = 1).reset_index(drop=True), df['rating'].values.astype(np.int)

In [4]:
x_train, y_train = preprocess_data(trainData)
x_test, y_test = preprocess_data(testData)

## Exploratory analysis

In [5]:
"""values, counts = np.unique(y_train, return_counts=True)
plt.bar(values, counts, tick_label = values, width = 0.5)
plt.xlabel("Ratings")
plt.ylabel("Counts")
plt.savefig('TrainRatings')
plt.show()""";

In [6]:
"""values, counts = np.unique(y_test, return_counts=True)
plt.bar(values, counts, tick_label = values, width = 0.5)
plt.xlabel("Ratings")
plt.ylabel("Counts")
plt.savefig('TestRatings')
plt.show()""";

In [7]:
"""values, counts = np.unique(x_train.year, return_counts=True)
plt.bar(values, counts, tick_label = values, width = 0.5)
plt.xlabel("Years")
plt.ylabel("Counts")
plt.savefig('TrainYear')
plt.show()""";

In [8]:
"""values, counts = np.unique(x_train.month, return_counts=True)
plt.bar(values, counts, tick_label = values, width = 0.5)
plt.xlabel("Months")
plt.ylabel("Counts")
plt.savefig('TrainMonth')
plt.show()""";

In [9]:
"""values, counts = np.unique(x_train.day, return_counts=True)
valuesSorted = values.astype(np.int).argsort()
plt.bar(values[valuesSorted], counts[valuesSorted], tick_label = values[valuesSorted], width = 0.5)
plt.xlabel("Days")
plt.ylabel("Counts")
plt.savefig('TrainDays')
plt.show()""";

In [10]:
"""top15 = trainData.condition.value_counts()[:15]
plt.bar(top15.index, top15.values, tick_label = top15.index, width = 0.5)
plt.xlabel("Conditions")
plt.xticks(rotation=45)
plt.ylabel("Counts")
plt.title("Top 15 conditions")
plt.savefig('top15conds')
plt.show()""";

In [11]:
"""withRatings = x_train[:]
withRatings['rating'] = y_train[:]
highRatings = withRatings[withRatings['rating']>7].reset_index(drop=True)

lowRatings = withRatings[withRatings['rating']<4].reset_index(drop=True)

middleRatings = withRatings[(withRatings['rating'] >3) & (withRatings['rating']<8)].reset_index(drop=True)""";

In [12]:
"""top15 = middleRatings.year.value_counts()
plt.bar(top15.index, top15.values, tick_label = top15.index, width = 0.5)
plt.xlabel("Years")
plt.xticks(rotation=45)
plt.ylabel("Counts")
plt.title("Top years for middle ratings")
plt.savefig('top15yearMiddle')
plt.show()""";

In [13]:
"""top15 = lowRatings.year.value_counts()
plt.bar(top15.index, top15.values, tick_label = top15.index, width = 0.5)
plt.xlabel("Years")
plt.xticks(rotation=45)
plt.ylabel("Counts")
plt.title("Top years for low ratings")
plt.savefig('top15yearLow')
plt.show()""";

In [14]:
"""top15 = highRatings.year.value_counts()
plt.bar(top15.index, top15.values, tick_label = top15.index, width = 0.5)
plt.xlabel("Years")
plt.xticks(rotation=45)
plt.ylabel("Counts")
plt.title("Top years for high ratings")
plt.savefig('top15yearHigh')
plt.show()""";

## Feature importance

In [15]:
numAttr = ['usefulCount', 'reviewLen']
catAttr = ['drugName', 'condition', 'year', 'month']

transformer = ColumnTransformer([
                              ('num', StandardScaler(), numAttr),
                              ('day', OrdinalEncoder(), ['day']),
                              ('cat', OneHotEncoder(handle_unknown='ignore'), catAttr)
])

x_train_prepared = transformer.fit_transform(x_train)
x_test_prepared = transformer.transform(x_test)

trainPreparedFeatureNames = np.array(numAttr + ['day'] + list(transformer.named_transformers_['cat'].get_feature_names(catAttr)))
y_ticks = np.arange(0, len(trainPreparedFeatureNames))

In [16]:
def get_fea_imp(model, title = "", outImp = ""):
    model.fit(x_train_prepared, y_train)
    modImp = model.feature_importances_
    modelSortedInd = modImp.argsort()

    fig, ax = plt.subplots(figsize=(12,12))
    ax.barh(y_ticks[-20:], modImp[modelSortedInd][-20:])
    ax.set_yticklabels(trainPreparedFeatureNames[modelSortedInd][-20:])
    ax.set_yticks(y_ticks[-20:])
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(outImp)

    plt.show()

### Random forest regression feature importance

In [17]:
"""rfReg = RandomForestRegressor(n_jobs=-1, random_state=42)
get_fea_imp(rfReg, "Random Forest Regressor Feature Importance", 'rfRegFeaImp.png')""";

### Random forest classification feature importance

In [18]:
"""rfClf = RandomForestClassifier(n_jobs=-1, random_state=42)
get_fea_imp(rfClf, "Random Forest Classifier Feature Importance", 'rfClfFeaImp.png')""";

### XGBoost regression feature importance

In [19]:
"""xgbReg = XGBRegressor()
get_fea_imp(xgbReg, "XGB Regressor Feature Importance", 'xgbRegFeaImp.png')""";

### XGBoost classification feature importance

In [20]:
"""xgbClf = XGBClassifier()
get_fea_imp(xgbClf, "XGB Classifier Feature Importance", 'xgbClfFeaImp.png')""";

# Surprise model

## Data

In [21]:
dfTrVal = x_train[["condition", "drugName"]] 
dfTrVal['rating'] = y_train

dfTr, dfVal = python_stratified_split(
    dfTrVal,
    ratio=0.8, 
    min_rating=1, 
    filter_by="user", 
    col_user='condition', 
    col_item='drugName'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(dfTrVal[['condition', 'drugName', 'rating']], reader)
trainset = data.build_full_trainset()

dataVal = Dataset.load_from_df(dfVal[['condition', 'drugName', 'rating']], reader)

## SVD

In [23]:
"""parameters = {
    "n_factors": [5, 10, 15, 20],
    "n_epochs": [10, 15, 20, 30],
    "lr_all": [0.001, 0.005, 0.007, 0.01],
    "reg_all": [0.005, 0.01, 0.02, 0.04, 0.08]
}

gridSVD = GridSearchCV(SVD, param_grid=parameters, n_jobs=-1, refit=True, cv=4, measures=["rmse"], joblib_verbose=4)

gridSVD.fit(dataVal)

trainPara = gridSVD.best_params['mse']
trainPara""";

In [24]:
trainPara = {'lr_all': 0.005, 'n_epochs': 10, 'n_factors': 5, 'reg_all': 0.08}

algoSVD = SVD(n_factors = trainPara['n_factors'], n_epochs = trainPara['n_epochs'],
                 lr_all = trainPara['lr_all'], reg_all = trainPara['reg_all'])

algoSVD.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f1f4b0ebf90>

In [25]:
predSVD = []
for cond, drug in list(zip(x_test.condition, x_test.drugName)):
    temp = algoSVD.predict(cond, drug)
    predSVD.append(temp.est)

## BaselineOnly

### ALS

In [26]:
"""param_grid = {'verbose': [False],
    'bsl_options': {'method': ['als'],
                    'reg_i': [5, 10, 15, 20],
                    'reg_u': [10, 15, 20, 30],
                    'n_epochs': [10, 15, 20, 30]}
              }


gridALS = GridSearchCV(BaselineOnly, param_grid=param_grid, n_jobs=-1, refit=True, cv=4, measures=["rmse"], joblib_verbose=4)

gridALS.fit(dataVal)

trainPara = gridALS.best_params['mse']
trainPara""";

In [27]:
trainPara = {'bsl_options': {'method': 'als', 'n_epochs': 15, 'reg_i': 10, 'reg_u': 20},
 'verbose': False}

algoALS = BaselineOnly(bsl_options = trainPara['bsl_options'])

algoALS.fit(trainset)

predALS = []
for cond, drug in list(zip(x_test.condition, x_test.drugName)):
    temp = algoALS.predict(cond, drug)
    predALS.append(temp.est)

Estimating biases using als...


### SGD

In [28]:
"""param_grid = {'verbose': [False],
    'bsl_options': {'method': ['sgd'],
                    'reg': [0.005, 0.01, 0.02, 0.05, 0.1],
                    'learning_rate': [0.001, 0.005, 0.007, 0.01],
                    'n_epochs': [10, 15, 20, 30]
                    }
              }

gridSGD = GridSearchCV(BaselineOnly, param_grid=param_grid, n_jobs=-1, refit=True, cv=4, measures=["rmse"])

gridSGD.fit(dataVal)

trainPara = gridSGD.best_params['mse']
trainPara""";

In [29]:
trainPara = {'bsl_options': {'learning_rate': 0.005,
  'method': 'sgd',
  'n_epochs': 15,
  'reg': 0.1},
 'verbose': False}

algoSGD = BaselineOnly(bsl_options = trainPara['bsl_options'])

algoSGD.fit(trainset)

predSGD = []
for cond, drug in list(zip(x_test.condition, x_test.drugName)):
    temp = algoSGD.predict(cond, drug)
    predSGD.append(temp.est)

Estimating biases using sgd...


## KNN

In [30]:
"""param_grid = {'k': [40, 50, 200, 500, 1000, 1500],
              'verbose': [False],
              'sim_options': {'name': ['pearson_baseline']},
              'bsl_options': {'method': ['sgd'],
                            'learning_rate': [0.001, 0.002, 0.005, 0.007, 0.01],
                            'n_epochs': [10, 15, 20, 30, 50],
                            'reg': [0.01, 0.02, 0.05, 0.07, 0.1]}
              }

gridKNN = GridSearchCV(KNNBaseline, param_grid, measures=['rmse'], cv=4)

gridKNN.fit(dataVal)

trainPara = gridKNN.best_params['rmse']
trainPara""";

In [31]:
trainPara = {'bsl_options': {'learning_rate': 0.001,
  'method': 'sgd',
  'n_epochs': 10,
  'reg': 0.1},
 'k': 1000,
 'sim_options': {'name': 'pearson_baseline', 'user_based': True},
 'verbose': False}

algoKNN = KNNBaseline(k = trainPara['k'], bsl_options = trainPara['bsl_options'])

algoKNN.fit(trainset)

predKNN = []
for cond, drug in list(zip(x_test.condition, x_test.drugName)):
    temp = algoKNN.predict(cond, drug)
    predKNN.append(temp.est)

Estimating biases using sgd...
Computing the msd similarity matrix...
Done computing similarity matrix.


### Scores (RMSE, MAE, R2)

In [32]:
def return_score(truth, pred):
    return mean_squared_error(truth, pred, squared = False), mean_absolute_error(truth, pred), r2_score(truth, pred)

In [33]:
return_score(y_test, predSVD)

(3.0691742064413994, 2.5473226325264764, 0.12876051629344842)

In [34]:
return_score(y_test, predALS)

(3.066862984458325, 2.554696872329258, 0.1300721848642723)

In [35]:
return_score(y_test, predSGD)

(3.074845209645341, 2.5595736594399483, 0.12553791229435307)

In [36]:
return_score(y_test, predKNN)

(3.064171334831537, 2.5422723068189663, 0.13159850890125224)

# Classification models

## KNN

In [37]:
"""knnParam = {
    'n_neighbors': range(5, 141, 15),
    'weights': ['uniform', 'distance'],
    'n_jobs': [-1]
}

knnClf = KNeighborsClassifier()

gridsKNN = GridSearchCVSK(KNeighborsClassifier(), knnParam, cv = 4, scoring = 'accuracy', verbose = 2)
gridsKNN.fit(x_train_prepared, y_train)
gridsKNN.best_params_
""";

In [38]:
trainPara = {'n_jobs': -1, 'n_neighbors': 5, 'weights': 'distance'}
knnClf = KNeighborsClassifier(n_neighbors= trainPara['n_neighbors'], weights= trainPara['weights'], n_jobs = -1)
knnClf.fit(x_train_prepared, y_train)
predKNNClf = knnClf.predict(x_test_prepared)

In [39]:
def return_clf_score(truth, pred):
    print('The accuracy is: ', accuracy_score(truth, pred))
    print()
    print(classification_report(truth, pred))

In [40]:
return_clf_score(y_test, predKNNClf)

The accuracy is:  0.3591380325048667

              precision    recall  f1-score   support

           1       0.37      0.41      0.39      6048
           2       0.20      0.16      0.17      1977
           3       0.20      0.15      0.17      1883
           4       0.16      0.12      0.13      1415
           5       0.21      0.16      0.18      2280
           6       0.19      0.13      0.16      1779
           7       0.20      0.15      0.17      2600
           8       0.27      0.24      0.26      5102
           9       0.34      0.33      0.34      7587
          10       0.48      0.58      0.52     13507

    accuracy                           0.36     44178
   macro avg       0.26      0.24      0.25     44178
weighted avg       0.34      0.36      0.35     44178



## XGBoost

In [41]:
"""xgbParam = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'n_estimators': [100, 500, 1000],
        'max_depth': [3, 4, 5, 6, 7, 8],
        'eta': [0.1, 0.2, 0.3]
        }

xgbClf = XGBClassifier()

gridsXGB = RandomizedSearchCV(xgbClf, xgbParam, cv = 4, scoring = 'accuracy', verbose = 2, random_state=42)
gridsXGB.fit(x_train_prepared, y_train)
gridsXGB.best_params_""";

In [42]:
trainPara = {
        'n_jobs': -1, 
        'min_child_weight': 1,
        'subsample': 0.8,
        'n_estimators': 500,
        'max_depth': 7,
        'eta': 0.1
        }

xgbClf = XGBClassifier(min_child_weight = trainPara['min_child_weight'], n_jobs = -1, subsample= trainPara['subsample'],
                       n_estimators = trainPara['n_estimators'], max_depth = trainPara['max_depth'], eta = trainPara['eta'])
xgbClf.fit(x_train_prepared, y_train)
predXGBClf = xgbClf.predict(x_test_prepared)

In [43]:
return_clf_score(y_test, predXGBClf)

The accuracy is:  0.35925121101000496

              precision    recall  f1-score   support

           1       0.32      0.55      0.41      6048
           2       0.23      0.01      0.03      1977
           3       0.25      0.02      0.04      1883
           4       0.23      0.01      0.03      1415
           5       0.21      0.03      0.05      2280
           6       0.30      0.03      0.05      1779
           7       0.23      0.03      0.05      2600
           8       0.23      0.07      0.11      5102
           9       0.25      0.15      0.19      7587
          10       0.40      0.80      0.54     13507

    accuracy                           0.36     44178
   macro avg       0.27      0.17      0.15     44178
weighted avg       0.30      0.36      0.27     44178



# Classification model for low, medium, high

## Data

In [44]:
pd.options.mode.chained_assignment = None

withRatingsTr = x_train[['drugName', 'condition', 'usefulCount']]
withRatingsTr['rating'] = y_train
withRatingsT = x_test[['drugName', 'condition', 'usefulCount']]
withRatingsT['rating'] = y_test

In [45]:
withRatingsTr.loc[withRatingsTr['rating'] <4, 'rating'] = -1
withRatingsTr.loc[(withRatingsTr['rating'] >=4) & (withRatingsTr['rating'] <=7), 'rating'] = 0
withRatingsTr.loc[withRatingsTr['rating'] >7, 'rating'] = 1

withRatingsT.loc[withRatingsT['rating'] <4, 'rating'] = -1
withRatingsT.loc[(withRatingsT['rating'] >=4) & (withRatingsT['rating'] <=7), 'rating'] = 0
withRatingsT.loc[withRatingsT['rating'] >7, 'rating'] = 1

In [46]:
catAttrClf = ['drugName', 'condition']

transformerClf = ColumnTransformer([
                              ('num', StandardScaler(), ['usefulCount']),
                              ('cat', OneHotEncoder(handle_unknown='ignore'), catAttrClf)
])

xClf_train, yClf_train = withRatingsTr.drop('rating', axis = 1), withRatingsTr['rating'].values.astype(np.int)
xClf_test, yClf_test = withRatingsT.drop('rating', axis = 1), withRatingsT['rating'].values.astype(np.int)

xClf_train_prepared = transformerClf.fit_transform(xClf_train)
xClf_test_prepared = transformerClf.transform(xClf_test)

## KNN

In [48]:
"""knnParam2 = {
    'n_neighbors': range(5, 30, 4),
    'weights': ['uniform', 'distance'],
    'n_jobs': [-1]
}

knnClf2 = KNeighborsClassifier()

gridsKNN2 = GridSearchCVSK(knnClf2, knnParam2, cv = 4, scoring = 'accuracy', verbose = 2)
gridsKNN2.fit(xClf_train_prepared, yClf_train)
gridsKNN2.best_params_""";

In [49]:
trainPara = {'n_jobs': -1, 'n_neighbors': 29, 'weights': 'uniform'}
knnClf2 = KNeighborsClassifier(n_neighbors= trainPara['n_neighbors'], weights= trainPara['weights'], n_jobs = -1)
knnClf2.fit(xClf_train_prepared, yClf_train)
predKNNClf2 = knnClf2.predict(xClf_test_prepared)

In [50]:
return_clf_score(yClf_test, predKNNClf2)

The accuracy is:  0.6179093666530853

              precision    recall  f1-score   support

          -1       0.45      0.39      0.42      9908
           0       0.32      0.07      0.11      8074
           1       0.67      0.87      0.76     26196

    accuracy                           0.62     44178
   macro avg       0.48      0.44      0.43     44178
weighted avg       0.56      0.62      0.57     44178



## XGBoost

In [51]:
"""xgbParam2 = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'n_estimators': [100, 500, 1000],
        'max_depth': [3, 4, 5, 6, 7, 8],
        'eta': [0.1, 0.2, 0.3]
        }

xgbClf2 = XGBClassifier()

gridsXGB2 = RandomizedSearchCV(xgbClf2, xgbParam2, cv = 4, scoring = 'accuracy', verbose = 2, random_state=42)
gridsXGB2.fit(xClf_train_prepared, yClf_train)
gridsXGB2.best_params_""";

In [52]:
trainPara = {
        'n_jobs': -1, 
        'min_child_weight': 1,
        'subsample': 1,
        'n_estimators': 1000,
        'max_depth': 7,
        'eta': 0.2
        }

xgbClf2 = XGBClassifier(min_child_weight = trainPara['min_child_weight'], n_jobs = -1, subsample= trainPara['subsample'],
                       n_estimators = trainPara['n_estimators'], max_depth = trainPara['max_depth'], eta = trainPara['eta'])
xgbClf2.fit(xClf_train_prepared, yClf_train)
predXGBClf2 = xgbClf2.predict(xClf_test_prepared)

In [53]:
return_clf_score(yClf_test, predXGBClf2)

The accuracy is:  0.6201050296527684

              precision    recall  f1-score   support

          -1       0.49      0.29      0.37      9908
           0       0.33      0.03      0.06      8074
           1       0.65      0.92      0.76     26196

    accuracy                           0.62     44178
   macro avg       0.49      0.42      0.40     44178
weighted avg       0.55      0.62      0.55     44178

