<a href="https://colab.research.google.com/github/QasimKhan5x/NLP-for-Business/blob/main/NLP_A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth
auth.authenticate_user()

Download the data from google spreadsheets

In [None]:
import gspread
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())
wb = gc.open_by_url('https://docs.google.com/spreadsheets/d/1lInyxuxoW8KDKOLmrmVRuEPSNDHWXOZ6mx0LDWPzCXc')
sheet = wb.worksheet('game_reviews')
data = sheet.get_all_values()

In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.DataFrame(data)
df.columns = df.iloc[0]
df = df.iloc[1:]
df.head()

Unnamed: 0,author age,recommended age,review
1,"Teen, 17 years old",age 7+,Kids dont listent to the parents saying ''ThIs...
2,"Kid, 11 years old",age 2+,I have been playing this game for many years a...
3,"Kid, 12 years old",age 7+,The game is great with no true inappropriate t...
4,"Teen, 13 years old",age 5+,Are you sure you got common sense. I would giv...
5,"Kid, 10 years old",age 5+,IDK WHAT TO SAY BUT DIS IS DA BEST GAME EVA


Featurize the 'author age' column and then drop it

In [None]:
df['author_age_cat'] = df['author age'].str.split(', ', expand=True).loc[:, 0]
df['author_age'] = df['author age'].str.split(', ', expand=True).loc[:, 1].str.extract('(\d{2})')
df['author_age'] = pd.to_numeric(df['author_age'])
df.drop('author age', axis=1, inplace=True)
df.head()

Unnamed: 0,recommended age,review,author_age_cat,author_age
1,age 7+,Kids dont listent to the parents saying ''ThIs...,Teen,17.0
2,age 2+,I have been playing this game for many years a...,Kid,11.0
3,age 7+,The game is great with no true inappropriate t...,Kid,12.0
4,age 5+,Are you sure you got common sense. I would giv...,Teen,13.0
5,age 5+,IDK WHAT TO SAY BUT DIS IS DA BEST GAME EVA,Kid,10.0


Re-order the columns

In [None]:
cols = df.columns.tolist()
cols = cols[2:] + cols[:2]
df = df[cols]
df.head()

Unnamed: 0,author_age_cat,author_age,recommended age,review
1,Teen,17.0,age 7+,Kids dont listent to the parents saying ''ThIs...
2,Kid,11.0,age 2+,I have been playing this game for many years a...
3,Kid,12.0,age 7+,The game is great with no true inappropriate t...
4,Teen,13.0,age 5+,Are you sure you got common sense. I would giv...
5,Kid,10.0,age 5+,IDK WHAT TO SAY BUT DIS IS DA BEST GAME EVA


Classify review as safe if the recommended age is less than 10

In [None]:
rec_ages = pd.to_numeric(df['recommended age'].str.extract('(\d+)').loc[:, 0])
df['safe'] = rec_ages.apply(lambda age: 0 if age >= 10 else 1)
df.head()

Unnamed: 0,author_age_cat,author_age,recommended age,review,safe
1,Teen,17.0,age 7+,Kids dont listent to the parents saying ''ThIs...,1
2,Kid,11.0,age 2+,I have been playing this game for many years a...,1
3,Kid,12.0,age 7+,The game is great with no true inappropriate t...,1
4,Teen,13.0,age 5+,Are you sure you got common sense. I would giv...,1
5,Kid,10.0,age 5+,IDK WHAT TO SAY BUT DIS IS DA BEST GAME EVA,1


In [None]:
y = df['safe'].values

In [None]:
reviews = df['review'].values
reviews

## NLP Part

In [144]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import re

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer

import gensim
from gensim.parsing.preprocessing import remove_stopwords

# library that contains punctuation
import string

In [225]:
lower_vectorized = np.vectorize(lambda x: x.lower().strip())
lower_vectorized.__name__= "lower, strip"
reviews_lower = lower_vectorized(reviews)

In [226]:
def remove_words(text):
    # remove url
    text = re.sub(r'http[^\s]+', '', text)
    # remove non word
    text = re.sub(r'[^a-zA-Z ]', '', text)
    # remove long words
    text = re.sub(r'\w{20,}', '', text)
    # remove minecraft word
    text = re.sub(r'minecraft', '', text, flags=re.IGNORECASE)
    # remove multiple spaces
    return re.sub(r' +', ' ', text).strip()

remove_words_vectorized = np.vectorize(remove_words)
remove_words_vectorized.__name__= "garbage remover"
reviews_pp = remove_words_vectorized(reviews_lower)

In [227]:
remove_stopwords_vectorized = np.vectorize(remove_stopwords)
remove_stopwords_vectorized.__name__= "stopwords remover"
reviews_without_sw = remove_stopwords_vectorized(reviews_pp)

In [228]:
punc = string.punctuation + '“”’━•'

# defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree= "".join([i for i in text if i not in punc])
    return punctuationfree

remove_punctuation_vectorized = np.vectorize(remove_punctuation)
remove_punctuation_vectorized.__name__= "punctuation remover"
reviews_without_punc = remove_punctuation_vectorized(reviews_without_sw)

In [229]:
reviews_pp = remove_words_vectorized(reviews_without_punc)

In [230]:
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

In [245]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def lemmatize(word):
    pos = get_wordnet_pos(word)
    return lemmatizer.lemmatize(word, pos)

def stem(word):
    return stemmer.stem(word)


def preprocess(text):
    result = []
    tokens = gensim.utils.simple_preprocess(text)
    for token in tokens:
        if token not in gensim.parsing.preprocessing.STOPWORDS:
            result.append(lemmatize(token))
    return ' '.join(result)

In [246]:
stemmer_vectorized = np.vectorize(preprocess)
stemmer_vectorized.__name__= "lemmatize and stem"
reviews_preprocessed = stemmer_vectorized(reviews_pp)

In [247]:
X_reviews = np.array([])
y_labels = np.array([])

for i, review in enumerate(reviews_preprocessed):
    if review != '':
        X_reviews = np.append(X_reviews, review)
        y_labels = np.append(y_labels, y[i])

In [248]:
X_reviews.shape, y_labels.shape

((523,), (523,))

## ML Part

In [249]:
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.model_selection import KFold, RandomizedSearchCV

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [250]:
def crossvalidate_classifier(model, X, y, cm=False):
    
    scores = cross_val_score(model, X, y, scoring='f1_macro', cv=5)
    print("%0.2f f-1 score with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
    y_pred = cross_val_predict(model, X, y, cv=5)

    if(cm):
        conf_mat = confusion_matrix(y, y_pred, labels=['fulfillment','other'])
        metrics.plot_confusion_matrix(conf_mat, classes = ['fulfillment','other'])
    
def evaluate_classifier(model, X_train, X_test, y_train, y_test):
      
    # ... fit your model here ...
    model.fit(X_train,y_train)

    # Run predict on your tfidf test data to get your predictions
    pred = model.predict(X_test)

    # Calculate your accuracy using the metrics module
    acc_score = metrics.accuracy_score(pred, y_test)
    print("Accuracy Score:   %0.3f" % acc_score)
    
    f1score = metrics.f1_score(pred, y_test, average='macro')
    print("F-1 Score:   %0.3f" % f1score)
    
    return model

def best_hyperparam(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=5, scoring_fit='f1_score',
                       do_probabilities = False):
    gs = RandomizedSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train_data, y_train_data, epochs=100)
    
    if do_probabilities:
      pred = fitted_model.predict_proba(X_test_data)
    else:
      pred = fitted_model.predict(X_test_data)
    
    return fitted_model, pred

In [251]:
# Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_reviews, y_labels, 
                                                    random_state=42, 
                                                    test_size=0.25,
                                                    shuffle=True, 
                                                    stratify=y_labels)

In [252]:
# Initialize count vectorizer
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=0.05)

# Create count train and test variables
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# Initialize tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=0.05)

# Create tfidf train and test variables
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [253]:
print(count_train.shape, tfidf_train.shape)

(392, 85) (392, 85)


In [254]:
print(count_vectorizer.vocabulary_)

{'great': 29, 'little': 41, 'encourages': 22, 'violence': 77, 'kill': 35, 'animal': 2, 'monster': 46, 'red': 60, 'parent': 52, 'like': 40, 'violent': 78, 'thats': 69, 'game': 26, 'food': 23, 'building': 9, 'blood': 7, 'dont': 18, 'good': 27, 'learn': 37, 'survival': 66, 'build': 8, 'world': 81, 'mode': 45, 'fun': 25, 'bad': 3, 'im': 31, 'kid': 34, 'start': 64, 'play': 54, 'creativity': 16, 'doesnt': 17, 'people': 53, 'want': 79, 'thing': 70, 'year': 82, 'use': 75, 'age': 0, 'recommend': 59, 'think': 71, 'block': 6, 'edition': 20, 'come': 13, 'love': 43, 'server': 63, 'lot': 42, 'online': 51, 'chat': 11, 'inappropriate': 32, 'cool': 14, 'let': 38, 'best': 4, 'turn': 73, 'stuff': 65, 'pocket': 57, 'way': 80, 'buy': 10, 'know': 36, 'time': 72, 'played': 55, 'life': 39, 'creative': 15, 'easy': 19, 'sword': 68, 'child': 12, 'say': 61, 'need': 48, 'multiplayer': 47, 'mob': 44, 'amaze': 1, 'version': 76, 'old': 50, 'real': 58, 'new': 49, 'update': 74, 'educational': 21, 'friend': 24, 'gore': 

In [255]:
rf_model = RandomForestClassifier()
fitted_model_bow = evaluate_classifier(rf_model, count_train, count_test, y_train, y_test)

Accuracy Score:   0.870
F-1 Score:   0.595


In [256]:
rf_model = RandomForestClassifier()
fitted_model_tfidf = evaluate_classifier(rf_model, tfidf_train, tfidf_test, y_train, y_test)

Accuracy Score:   0.893
F-1 Score:   0.653


In [257]:
svm_model = LinearSVC()
fitted_model_bow = evaluate_classifier(svm_model, count_train, count_test, y_train, y_test)

Accuracy Score:   0.817
F-1 Score:   0.488




In [258]:
svm_model = LinearSVC()
fitted_model_bow = evaluate_classifier(svm_model, tfidf_train, tfidf_test, y_train, y_test)

Accuracy Score:   0.870
F-1 Score:   0.518
