In [19]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from html.parser import HTMLParser
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
#from imblearn.under_sampling import RandomUnderSampler

import xgboost as xgb

from sklearn.metrics import classification_report

import time
#nltk.download('stopwords')
#nltk.download('punkt')

In [14]:
stop_words = list(set(stopwords.words('english')))
punct = list(set(string.punctuation))
punct.append('“')
punct.append('”')
stemmer = PorterStemmer()

def text_cleaning(text):
    # converting HTML character codes to ASCII code
    parser = HTMLParser()
    text =  parser.unescape(text)
    
    text = re.sub(r'<[^>]+>','',text) # removing HTML tags
    text = re.sub(r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)','',text) # removing hash-tags
    text = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+','',text) # removing URLs
    text = re.sub(r'(?:[\ufffd]+)','',text) #removing special characters
    text = word_tokenize(text)
    text = ' '.join(word for word in text if word not in punct) #remove punctuation
    text = re.sub('\n',' ',text) #remove new line
    text = re.sub('@','',text) #remove @ sign
    text = re.sub('\'','',text) # remove '
    text = text.lower() # lowercase all characters
    text = word_tokenize(text) # tonkenize words
    text = [i for i in text if not i in stop_words] #remove stop words
    #text = [stemmer.stem(word) for word in text]
    text = ' '.join(text)
    
    return text


In [16]:
#df_1 = pd.read_csv('/content/gdrive/My Drive/Amazon Seller Forum/cleaned_comments.csv') # Gabriel's cleaned comments
#df_2 = pd.read_csv('/content/gdrive/My Drive/Amazon Seller Forum/Amazon_Forum_scrape2.csv',encoding= 'unicode_escape') # Wei Huang's comments

df_1 = pd.read_csv('cleaned_comments.csv')
df_2 = pd.read_csv('Amazon_Forum_scrape2.csv',encoding= 'unicode_escape')

In [17]:
# Cleaning df_1
df_1.drop(['Unnamed: 0','Link','Reply Times'],axis=1,inplace=True)
df_1['Leading Comment'] = df_1['Leading Comment'].apply(lambda x: ' '.join(word for word in word_tokenize(x) if (word != '“') and (word != '”')))
df_1['Title'] = df_1['Title'].apply(lambda x: text_cleaning(x))
df_1["Publish Time"] = df_1["Publish Time"].apply(lambda x: pd.to_datetime(x))
df_1['Publish hour'] = df_1["Publish Time"].apply(lambda x: x.hour)
df_1['Reply Authors'] = df_1['Reply Authors'].apply(lambda x: word_tokenize(x))
df_1['Reply Authors'] = df_1['Reply Authors'].apply(lambda x: [word for word in x if word not in punct])
df_1['Reply Authors'] = df_1['Reply Authors'].apply(lambda x: [re.sub('\'','',x[i]) for i in range(len(x))])
df_1['Reply Comments'] = df_1['Reply Comments'].apply(lambda x: text_cleaning(x)) 
df_1['num_Reply_Authors'] = df_1['Reply Authors'].apply(lambda x: len(x))

# Cleaning df_2
df_2.drop('Unnamed: 0',axis=1,inplace=True)
df_2.dropna(axis=0,inplace=True)
df_2['Topic Title'] = df_2['Topic Title'].apply(lambda x: text_cleaning(x))
df_2['Category'] = df_2['Category'].apply(lambda x: x.strip())

df_2['Authors'] = df_2['Authors'].apply(lambda x: word_tokenize(x))
df_2['Authors'] = df_2['Authors'].apply(lambda x: [word for word in x if word not in punct])
df_2['Authors'] = df_2['Authors'].apply(lambda x: [re.sub('\'','',x[i]) for i in range(len(x))])
df_2['Reply Authors'] = df_2['Authors'].apply(lambda x: x[1:])
df_2['Authors'] = df_2['Authors'].apply(lambda x: x[0])

df_2['Leading Comment'] = df_2['Leading Comment'].apply(lambda x: text_cleaning(x))
df_2['Other Comments'] = df_2['Other Comments'].apply(lambda x: text_cleaning(x))
df_2.rename(columns={'Topic Title':'Title','Authors':'Post Author','Other Comments':'Reply Comments'},inplace=True)

  # Remove the CWD from sys.path while we load stuff.


In [18]:
df = pd.concat([df_1[['Title','Category','Post Author','Reply Authors','Leading Comment','Reply Comments']],
                df_2[['Title','Category','Post Author','Reply Authors','Leading Comment','Reply Comments']]],axis=0)
df['num_Reply_Authors'] = df['Reply Authors'].apply(lambda x: len(x))
#df.drop_duplicates(keep='first',inplace=True)
df.reset_index(drop=True,inplace=True)


## XGBoost

In [None]:
df['Leading Comment'] = df['Leading Comment'].apply(lambda x: word_tokenize(x))
df['Leading Comment'] = df['Leading Comment'].apply(lambda x: [stemmer.stem(word) for word in x])
df['Leading Comment'] = df['Leading Comment'].apply(lambda x: ' '.join(x))

df['Reply Comments'] = df['Reply Comments'].apply(lambda x: word_tokenize(x))
df['Reply Comments'] = df['Reply Comments'].apply(lambda x: [stemmer.stem(word) for word in x])
df['Reply Comments'] = df['Reply Comments'].apply(lambda x: ' '.join(x))

<b> Leading comments</b>

In [165]:
Encoder = LabelEncoder()

vectorizer = TfidfVectorizer(max_df=0.7,
                             max_features=6000,
                             use_idf=True,
                             ngram_range=(2,3))

vectorized_matrix = vectorizer.fit_transform(df['Leading Comment'])

X = pd.DataFrame.sparse.from_spmatrix(vectorized_matrix,
                                      columns=vectorizer.get_feature_names())

y = df['Category']
y = Encoder.fit_transform(y)

No Resampling

In [164]:
# Split dataset into training and testing
X_train,X_test,y_train,y_test = train_test_split(X.values,y,
                                                 test_size=0.3,
                                                 random_state=123)
# Initiate model
clf = xgb.XGBClassifier(max_depth = 10, 
                        n_estimators = 150, 
                        n_jobs = 3, 
                        colsample_bytree = 0.5,
                        gamma = 0.01,
                        objective='multi:softmax')

start_time = time.time()

# Fit the model
clf.fit(X_train,y_train)

# Predict
y_pred = clf.predict(X_test)

end_time = time.time()

run_time = end_time - start_time

# Display result
result = classification_report(y_test,y_pred)
print(result)

print('model run time: {} s'.format(np.round(run_time,2)))

              precision    recall  f1-score   support

           0       0.79      0.75      0.77       429
           1       0.63      0.26      0.37        93
           2       0.67      0.31      0.42       101
           3       0.62      0.46      0.53       180
           4       0.72      0.50      0.59        98
           5       0.68      0.56      0.62       461
           6       0.48      0.11      0.18       178
           7       0.73      0.28      0.40       176
           8       0.11      0.04      0.06        27
           9       0.83      0.45      0.59        64
          10       0.39      0.76      0.52       623

    accuracy                           0.55      2430
   macro avg       0.61      0.41      0.46      2430
weighted avg       0.61      0.55      0.54      2430

model run time: 1297.55 s


Resampling 

In [166]:
rus = RandomUnderSampler()
X_resampled,y_resampled = rus.fit_resample(X,y)

X_train,X_test,y_train,y_test = train_test_split(X_resampled,y_resampled,
                                                 test_size=0.3,
                                                 random_state=123)

clf = xgb.XGBClassifier(max_depth = 10, 
                        n_estimators = 150, 
                        n_jobs = 3, 
                        colsample_bytree = 0.5,
                        gamma = 0.01,
                        objective='multi:softmax')

start_time = time.time()

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

end_time = time.time()

run_time = end_time - start_time

result = classification_report(y_test,y_pred)
print(result)

print('model run time: {} s'.format(np.round(run_time,2)))




              precision    recall  f1-score   support

           0       0.40      0.43      0.42        23
           1       0.39      0.36      0.37        25
           2       0.36      0.33      0.35        27
           3       0.50      0.28      0.36        36
           4       0.50      0.42      0.46        26
           5       0.24      0.35      0.29        20
           6       0.16      0.46      0.24        24
           7       0.43      0.33      0.38        27
           8       0.29      0.17      0.21        24
           9       0.62      0.50      0.56        30
          10       0.22      0.16      0.18        32

    accuracy                           0.34       294
   macro avg       0.37      0.34      0.35       294
weighted avg       0.38      0.34      0.35       294

model run time: 173.55 s


<b> all comments </b>

In [22]:
corpus = df['Leading Comment'] + df['Reply Comments']

In [24]:
Encoder = LabelEncoder()

vectorizer = TfidfVectorizer(max_df=0.7,
                             max_features=6000,
                             use_idf=True,
                             ngram_range=(2,3))

vectorized_matrix = vectorizer.fit_transform(corpus)

X = pd.DataFrame.sparse.from_spmatrix(vectorized_matrix,
                                      columns=vectorizer.get_feature_names())

y = df['Category']
y = Encoder.fit_transform(y)

In [28]:
print('XGBoost classifier Benchmark')
clf = xgb.XGBClassifier(objective='multi:softmax')

start_time = time.time()

# Fit the model
clf.fit(X_train,y_train)

# Predict
y_pred = clf.predict(X_test)

end_time = time.time()

run_time = end_time - start_time

# Display result
result = classification_report(y_test,y_pred)
print(result)

print('model run time: {} s'.format(np.round(run_time,2)))

XGBoost classifier Benchmark
              precision    recall  f1-score   support

           0       0.84      0.87      0.86       429
           1       0.67      0.35      0.46        93
           2       0.73      0.48      0.57       101
           3       0.64      0.48      0.55       180
           4       0.72      0.54      0.62        98
           5       0.73      0.61      0.66       461
           6       0.59      0.30      0.40       178
           7       0.79      0.56      0.66       176
           8       0.00      0.00      0.00        27
           9       0.90      0.69      0.78        64
          10       0.47      0.76      0.58       623

    accuracy                           0.64      2430
   macro avg       0.64      0.51      0.56      2430
weighted avg       0.66      0.64      0.63      2430

model run time: 2312.28 s


In [26]:
X_train,X_test,y_train,y_test = train_test_split(X.values,y,
                                                 test_size=0.3,
                                                 random_state=123)
# Initiate model
clf = xgb.XGBClassifier(max_depth = 10, 
                        n_estimators = 150, 
                        n_jobs = 3, 
                        colsample_bytree = 0.5,
                        gamma = 0.01,
                        objective='multi:softmax')

start_time = time.time()

# Fit the model
clf.fit(X_train,y_train)

# Predict
y_pred = clf.predict(X_test)

end_time = time.time()

run_time = end_time - start_time

# Display result
result = classification_report(y_test,y_pred)
print(result)

print('model run time: {} s'.format(np.round(run_time,2)))

              precision    recall  f1-score   support

           0       0.85      0.88      0.86       429
           1       0.59      0.39      0.47        93
           2       0.61      0.43      0.50       101
           3       0.63      0.51      0.56       180
           4       0.68      0.52      0.59        98
           5       0.69      0.64      0.66       461
           6       0.48      0.33      0.39       178
           7       0.78      0.56      0.65       176
           8       0.14      0.04      0.06        27
           9       0.88      0.66      0.75        64
          10       0.48      0.70      0.57       623

    accuracy                           0.63      2430
   macro avg       0.62      0.51      0.55      2430
weighted avg       0.64      0.63      0.62      2430

model run time: 3027.77 s
