In [4]:
import pandas as pd
import nltk

nltk.download('stopwords')

#Load dataset
ytc = pd.read_csv('YoutubeCommentsDataSet.csv')

ytc = ytc.dropna()
ytc.head()

[nltk_data] Downloading package stopwords to /home/repl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [38]:
import pandas as pd
from nltk.corpus import stopwords as nltk_stopwords
import spacy


# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

# Preprocessing function
def preprocess(text):
    doc = nlp(text, disable=['ner', 'parser'])
    tokens = [token for token in doc if token.text.isalpha()]
    lemmas = [token.lemma_ for token in tokens if token.pos_ not in ['DET', 'NUM', 'SYM', 'X']]
    #pos = [token.pos_ for token in doc]
    #print(pos[:10])
    # Load stopwords from NLTK
    #stopwords = set(nltk_stopwords.words('english'))
    # Remove stopwords characters
    #a_lemmas = [lemma for lemma in lemmas if lemma not in stopwords]
    
    return ' '.join(lemmas)

# Preprocessing dataset 
ytp = pd.DataFrame(columns=['Comment'])

comments_list = ytc['Comment'].tolist() 

for i, comment in enumerate(comments_list):
    if i <= (ytc.shape[0]): #-18000 subtract to limit rows for tests
        comment = preprocess(comment)
        ytp = pd.concat([ytp, pd.DataFrame({'Comment': [comment]}, index=[i])])

ytp.head()

Unnamed: 0,Comment
0,let not forget that apple pay in require brand...
1,here in nz of retailer do even have contactles...
2,I will forever acknowledge channel with help o...
3,whenever I go to place that do take apple pay ...
4,apple pay be so convenient secure and easy to ...


In [33]:
#1 dataset without preprocessing

X = ytc['Comment'].copy()
y = ytc['Sentiment'].copy()

#2 dataset with preprocessing

Xp = ytp['Comment'].copy()
yp = ytc['Sentiment'].copy()

sent = {'neutral': 1, 'negative':0, 'positive':2}
yp= yp.map(sent)

In [34]:
#Fit count vectorizer with raw data

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

cv = CountVectorizer(min_df=3, max_df=0.7, ngram_range=(1,2))#min_df=3, ngram_range=(1,2))
cv_train = cv.fit_transform(X_train.values)
cv_test = cv.transform(X_test.values)
cv_train.shape

(14691, 32706)

In [35]:
#Fit count vectorizer with preprocessed data

Xp_train, Xp_test, yp_train, yp_test = train_test_split(Xp, yp, test_size=0.2, random_state=123)

cvp = CountVectorizer(min_df=3, max_df=0.7, ngram_range=(1,2))#min_df=3, ngram_range=(1,2))
cvp_train = cvp.fit_transform(Xp_train.values)
cvp_test = cvp.transform(Xp_test.values)
cvp_train.shape

(14691, 28743)

In [36]:
# Compare datasets with multinomial naive bayes classifier model

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

print('Scores for MultinomialNB:')

mnb = MultinomialNB()
mnb.fit(cv_train, y_train)
y_pred = mnb.predict(cv_test)
print('Raw dataset: ', round(metrics.accuracy_score(y_test, y_pred), 4))
# metrics.confusion_matrix(y_test, y_pred)

mnb_cl = MultinomialNB()
mnb_cl.fit(cvp_train, yp_train)
yp_pred = mnb_cl.predict(cvp_test)
print(f'Preprocessed dataset: {round(metrics.accuracy_score(yp_test, yp_pred), 4)}')
# metrics.confusion_matrix(y_test, y_pred)


Scores for MultinomialNB:
Raw dataset:  0.7228
Preprocessed dataset: 0.7245


In [37]:
# Compare datasets with support vector machines classifier model
from sklearn.svm import SVC

print('Scores for SVC:')

svc = SVC(kernel='linear')
svc.fit(cv_train, y_train)
ys_pred = svc.predict(cv_test)
print(f'Raw dataset: {round(metrics.accuracy_score(y_test, ys_pred), 4)}')

svc = SVC(kernel='linear')
svc.fit(cvp_train, yp_train)
yp_pred = svc.predict(cvp_test)
print(f'Preprocessed dataset: {round(metrics.accuracy_score(yp_test, yp_pred), 4)}')

Scores for SVC:
Raw dataset: 0.7329
Preprocessed dataset: 0.7422
