In [14]:
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from yellowbrick.cluster import KElbowVisualizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

In [15]:
stopwords = nltk.corpus.stopwords.words('english')
wnl = WordNetLemmatizer()

In [16]:
twitter_df = pd.read_csv('nba_twitter.csv')

In [4]:
def tfidf_vectorizer(dataframe):
    
    tfidf = TfidfVectorizer(stop_words=stopwords)
    tfidf_sparse_mat = tfidf.fit_transform(dataframe['tweet'])
    
    tfidf_df = pd.DataFrame(tfidf_sparse_mat.toarray())
    tfidf_df.columns = tfidf.get_feature_names()
    
    return tfidf_df

In [17]:
twitter_df.head()

Unnamed: 0,date,tweet,source,tweet_tokens,dual_labels,trip_labels,quad_labels,five_labels
0,2021-02-26 12:35:27,Obi Toppin is expected to participate in the N...,Yahoo,"['obi', 'toppin', 'expected', 'participate', '...",0,0,3,1
1,2021-02-25 23:08:17,MPJ SLAM,Yahoo,"['mpj', 'slam', '']",1,2,1,2
2,2021-02-25 17:48:50,Timberwolves G Malik Beasley has been suspende...,Yahoo,"['timberwolves', 'g', 'malik', 'beasley', 'sus...",1,2,1,2
3,2021-02-25 13:40:20,Luka's game-winner against the Celtics vs. Luk...,Yahoo,"['lukas', 'gamewinner', 'celtic', 'v', 'lukas'...",1,2,2,2
4,2021-02-25 13:10:50,It's time to stop underrating the Utah Jazz Fr...,Yahoo,"['time', 'stop', 'underrating', 'utah', 'jazz'...",1,2,1,2


In [18]:
twitter_df.dropna(subset=['tweet'], inplace=True)

In [19]:
vectorizer = TfidfVectorizer(stop_words=stopwords)
twitter_sparse_mat = vectorizer.fit_transform(twitter_df['tweet'])

In [20]:
X = twitter_sparse_mat
y = twitter_df['quad_labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=123)

In [21]:
X

<127031x33623 sparse matrix of type '<class 'numpy.float64'>'
	with 1033100 stored elements in Compressed Sparse Row format>

In [22]:
mnb_vanilla = MultinomialNB()
mnb_vanilla.fit(X_train, y_train)

MultinomialNB()

In [24]:
mnb_vanilla_cv = cross_val_score(mnb_vanilla, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)

In [23]:
def summary_scores(model, train_set_x, test_set_x, train_set_y, test_set_y, cross_val_var):
    print(f'Training Accuracy: {model.score(train_set_x, train_set_y)}')
    print(f'Cross Validation Accuracy: {np.mean(cross_val_var)}')
    print(f'Testing Accuracy: {model.score(test_set_x, test_set_y)}')

In [25]:
summary_scores(mnb_vanilla, X_train, X_test, y_train, y_test, mnb_vanilla_cv)

Training Accuracy: 0.8329640917218655
Cross Validation Accuracy: 0.8154316819166707
Testing Accuracy: 0.8201259511939124


In [27]:
pipe_rfc_vanilla = Pipeline([('pca', TruncatedSVD(n_components=100, random_state=23)),
                    ('rfc', RandomForestClassifier())])

pipe_rfc_vanilla.fit(X_train, y_train)

Pipeline(steps=[('pca', TruncatedSVD(n_components=100, random_state=23)),
                ('rfc', RandomForestClassifier())])

In [28]:
pipe_rfc_vanilla_cv = cross_val_score(pipe_rfc_vanilla, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)

In [29]:
summary_scores(pipe_rfc_vanilla, X_train, X_test, y_train, y_test, pipe_rfc_vanilla_cv)

Training Accuracy: 0.9999662621877846
Cross Validation Accuracy: 0.9929263143477517
Testing Accuracy: 0.9939123589609027
