In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import json
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score
from sklearn.metrics import cohen_kappa_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Locations of the train and test CSV files. Both live under the same
# dataset folder, so the shared prefix is written only once.
_dataset_root = 'C:/Users/ROG/OneDrive/桌面/FYP/Dataset'
train_dir = _dataset_root + '/Train_data/train_data_after_washing.csv'
test_dir = _dataset_root + '/Test_data/test_data_after_washing.csv'

In [3]:
# pd.read_csv already returns a DataFrame, so no pd.DataFrame(...) re-wrap is
# needed. Only the review text and its rating are kept. The explicit .copy()
# detaches each two-column frame from the full CSV frame, so later cells can
# add columns to it without triggering pandas' SettingWithCopyWarning.
data = pd.read_csv(train_dir)[["review", "rating"]].copy()
test_data = pd.read_csv(test_dir)[["review", "rating"]].copy()

In [4]:
# Collapse the 1-10 rating into three classes:
#   1-4 -> 0, 5-8 -> 1, 9-10 -> 2.
# Ratings that are not keys of the lookup come out of .map() as NaN.
rating_to_label = {
    rating: (0 if rating <= 4 else 1 if rating <= 8 else 2)
    for rating in range(1, 11)
}
data['labels'] = data['rating'].map(rating_to_label)

In [5]:
# Apply the 1-10 rating -> class lookup to the test set:
#   ratings 1-4 -> class 0, 5-8 -> class 1, 9-10 -> class 2.
# Ratings that are not keys of the lookup come out of .map() as NaN.
test_rating_to_label = dict(zip(range(1, 11), [0] * 4 + [1] * 4 + [2] * 2))
test_data['labels'] = test_data['rating'].map(test_rating_to_label)

In [6]:
# Drop the raw rating now that the class label exists: both frames keep only
# the review text and its label.
data, test_data = (
    frame[["review", "labels"]] for frame in (data, test_data)
)

In [7]:
# Filled on the first clean() call so the stop-word corpus is read and the
# lemmatizer is built once, not once per document.
_CLEAN_CACHE = {}


def _clean_resources():
    """Return the cached ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _CLEAN_CACHE:
        lemmatizer = nltk.WordNetLemmatizer()
        # frozenset gives O(1) membership tests instead of scanning a list per token.
        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        # Store both only after both were built, so a failed lookup does not
        # leave a half-filled cache behind.
        _CLEAN_CACHE.update(lemmatizer=lemmatizer, stopwords=stop_words)
    return _CLEAN_CACHE['lemmatizer'], _CLEAN_CACHE['stopwords']


def clean(text):
    """Tokenize ``text`` and return its cleaned, lemmatized tokens.

    Steps, in order: tokenize with ``nltk.word_tokenize``, lower-case each
    token, drop English stop words, drop tokens that are not purely
    alphabetic, and lemmatize what is left with the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Cleaned tokens in their original order; empty if nothing survives.
    """
    lemmatizer, stop_words = _clean_resources()
    cleaned = []
    for token in nltk.word_tokenize(text):
        word = token.lower()
        if word in stop_words or not word.isalpha():
            continue
        cleaned.append(lemmatizer.lemmatize(word))
    return cleaned

In [8]:
def vectorize(data, tfidf_vect_fit):
    """Transform documents into a dense TF-IDF DataFrame.

    Parameters
    ----------
    data : iterable of str
        Documents to transform.
    tfidf_vect_fit : fitted vectorizer
        Object exposing ``transform`` and either ``get_feature_names_out``
        or the older ``get_feature_names``.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term, with the
        terms as column names. The matrix is densified with ``toarray()``.
    """
    X_tfidf = tfidf_vect_fit.transform(data)
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new name and fall back for old installs.
    if hasattr(tfidf_vect_fit, 'get_feature_names_out'):
        words = tfidf_vect_fit.get_feature_names_out()
    else:
        words = tfidf_vect_fit.get_feature_names()
    return pd.DataFrame(X_tfidf.toarray(), columns=list(words))

In [9]:
# clean() uses three NLTK resources: the WordNet corpus (lemmatizer), the
# English stop-word list, and the Punkt tokenizer models used by
# nltk.word_tokenize. Fetch all of them so a fresh environment can run the
# notebook top to bottom.
# NOTE(review): newer NLTK releases may need additional packages
# (e.g. 'omw-1.4', 'punkt_tab') - verify against the installed version.
for resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ROG\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# The vocabulary (capped at 8000 terms) is learned from the training reviews
# only; the same fitted vectorizer then transforms both splits so the test
# features share the training columns.
tfidf_vect = TfidfVectorizer(max_features=8000, analyzer=clean)
tfidf_vect_fit = tfidf_vect.fit(data['review'])

X_train, X_test = (
    vectorize(frame['review'], tfidf_vect_fit) for frame in (data, test_data)
)
y_train, y_test = (
    frame["labels"].to_numpy() for frame in (data, test_data)
)

In [11]:
def report_results(model, X, y, t, d,
                   out_dir='C:/Users/ROG/OneDrive/桌面/FYP/Model/W2V-RF/'):
    """Score ``model`` on ``(X, y)`` and save a normalized confusion-matrix plot.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict``.
    X : array-like
        Feature matrix passed to ``model.predict``.
    y : array-like
        True labels.
    t, d : str or int
        Identifiers used to build the image file name ``<t>_<d>.png``.
    out_dir : str, optional
        Directory prefix for the image. It is concatenated as-is, so it
        must end with a path separator. Defaults to the original location.

    Returns
    -------
    dict
        Keys ``'f1'``, ``'acc'``, ``'precision'``, ``'recall'`` (the three
        non-accuracy scores are weighted averages) and ``'kappa'``
        (Cohen's kappa), each as a plain ``float``.
    """
    pred = model.predict(X)
    result = {
        'f1': float(f1_score(y, pred, average='weighted')),
        'acc': float(accuracy_score(y, pred)),
        'precision': float(precision_score(y, pred, average='weighted')),
        'recall': float(recall_score(y, pred, average='weighted')),
        'kappa': float(cohen_kappa_score(y, pred)),
    }

    # Row-normalize so each row shows how one true class was distributed over
    # the predicted classes. A class with no true samples has a zero row
    # total; leave that row at 0 instead of dividing by zero.
    con_mat = confusion_matrix(y, pred)
    row_totals = con_mat.sum(axis=1, keepdims=True)
    con_mat_norm = np.divide(
        con_mat,
        row_totals,
        out=np.zeros(con_mat.shape, dtype=float),
        where=row_totals != 0,
    )
    con_mat_norm = np.around(con_mat_norm, decimals=2)

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        sns.heatmap(con_mat_norm, annot=True, cmap='Blues', ax=ax)
        # Span every row of the matrix rather than a hard-coded 3 classes.
        # NOTE(review): this overrides the y-limits chosen by seaborn,
        # presumably to work around clipped edge rows - confirm it is still needed.
        ax.set_ylim(0, con_mat.shape[0])
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        fig.savefig(f'{out_dir}{t}_{d}.png', dpi=300)
    finally:
        # Always release the figure, even if saving fails; this function is
        # called many times in a loop.
        plt.close(fig)

    return result

In [15]:
# Grid over forest size (n_estimators) and tree depth (max_depth). For each
# combination: fit on the training set, time the fit only, evaluate on the
# test set via report_results, and write the timing plus metrics to
# '<trees>_<depth>.txt'.
for t in [10, 20, 50, 100, 200, 1000]:
    for d in [10, 50, 100, 300]:
        t0 = time.perf_counter()
        # Fixed seed so a re-run reproduces the same forests and metrics.
        rf_classifier = RandomForestClassifier(n_estimators=t, max_depth=d,
                                               random_state=42)
        rf_classifier.fit(X_train, y_train)
        time_train = str(time.perf_counter() - t0) + ' s'
        tree, depth = str(t), str(d)
        result = report_results(rf_classifier, X_test, y_test, tree, depth)
        # The context manager closes the file even if a write raises.
        with open('C:/Users/ROG/OneDrive/桌面/FYP/Model/W2V-RF/' + tree + '_' + depth + '.txt', 'w') as text_file:
            text_file.write('time:' + time_train + ';')
            text_file.write(json.dumps(result))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Second grid with larger forests and one deeper depth setting. The
# 1000-tree runs at depths 10-300 also appear in the earlier grid and write
# to the same '<trees>_<depth>' file names, so they overwrite those outputs.
for t in [1000, 2000]:
    for d in [10, 50, 100, 300, 500]:
        t0 = time.perf_counter()
        # Fixed seed so a re-run reproduces the same forests and metrics.
        rf_classifier = RandomForestClassifier(n_estimators=t, max_depth=d,
                                               random_state=42)
        rf_classifier.fit(X_train, y_train)
        time_train = str(time.perf_counter() - t0) + ' s'
        tree, depth = str(t), str(d)
        result = report_results(rf_classifier, X_test, y_test, tree, depth)
        # The context manager closes the file even if a write raises.
        with open('C:/Users/ROG/OneDrive/桌面/FYP/Model/W2V-RF/' + tree + '_' + depth + '.txt', 'w') as text_file:
            text_file.write('time:' + time_train + ';')
            text_file.write(json.dumps(result))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
def test():
    """Return the fixed 3-tuple ``(1, 2, 3)``."""
    return tuple(range(1, 4))

In [2]:
# Unpack the three values returned by test() and print them space-separated.
(a, b, c) = test()
print(' '.join(str(value) for value in (a, b, c)))

1 2 3
