In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import progressbar
import pickle
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV


from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC

from sklearn import metrics
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import scikitplot as skplt

pd.set_option("display.max_rows", None)
np.set_printoptions(threshold=np.inf)

In [None]:
# load the original dataset
df = pd.read_csv('data/USvideos.csv')

In [None]:
# drop rows with missing values
df.dropna(inplace=True)

In [None]:
# drop all unnecessary columns
df.drop(df.columns[[0,1,5,7,8,9,10,11,12,13,14]], axis=1, inplace=True)

In [None]:
df.drop_duplicates(subset=["title","channel_title","tags","description"], inplace=True)
df.info()

In [None]:
def clean(df, column):
    '''
        Cleans the strings in specified column
    '''
    list = []
    print(column)
    table = str.maketrans('', '', string.punctuation)
    stop_words = nltk.corpus.stopwords.words('english')
    porter = nltk.stem.porter.PorterStemmer()
    for row in progressbar.progressbar(df[column]):
        index = df[column][df[column] == row].index.tolist()[0]
        line = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', row)
        tokens = nltk.word_tokenize(line)
        tokens = [word.lower() for word in tokens]
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
        words = [w for w in words if not w in stop_words]
        stemmed = [porter.stem(word) for word in words]
        stemmed = [word.strip() for word in stemmed if len(word) > 3]
        df[column][index] = ' '.join(stemmed)

In [None]:
%%time
title = clean(df, "title")
channel_title = clean(df, "channel_title")
tags = clean(df, "tags")
description = clean(df, "description")

In [None]:
# drop all the empty strings
df.replace('', float("NaN"), inplace=True)
df.dropna(inplace=True)

In [None]:
# drop all the nones
df.replace('none', float("NaN"), inplace=True)
df.dropna(inplace=True)
df = df[['title', 'channel_title', 'tags', 'description', 'category_id']]
df.info()

In [None]:
df.drop(df.columns[[1]], axis=1, inplace=True)
print(df.info())
df

## Save Dataframe

In [None]:
df.to_csv('data/clean.csv', columns=['title','channel_title','category_id','tags','description'], index=False)

In [None]:
%%time
# save df object
with open('data/df.pkl', 'wb') as f:
    pickle.dump(df, f)

## Load Dataframe

In [2]:
%%time
# save df object
with open('data/df.pkl', 'rb') as f:
    df = pickle.load(f)

Wall time: 6.98 ms


## TFIDF Vectorization

In [3]:
%%time
tfidf_title = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
tfidf_tags = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
tfidf_description = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

features_title = tfidf_title.fit_transform(df.title).toarray()
features_tags = tfidf_tags.fit_transform(df.tags).toarray()
features_description = tfidf_description.fit_transform(df.description).toarray()
labels = df.category_id
print('Title Features Shape: ' + str(features_title.shape))
print('Tags Features Shape: ' + str(features_tags.shape))
print('Description Features Shape: ' + str(features_description.shape))

Title Features Shape: (6263, 1860)
Tags Features Shape: (6263, 8074)
Description Features Shape: (6263, 17458)
Wall time: 1.89 s


# Train Test Split

In [4]:
%%time
# X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:3], df.category_id, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:4], df.category_id, random_state=123)
X_train_title_features = tfidf_title.transform(X_train.title).toarray()
X_train_tags_features = tfidf_tags.transform(X_train.tags).toarray()
X_train_description_features = tfidf_description.transform(X_train.description).toarray()

X_test_title_features = tfidf_title.transform(X_test.title).toarray()
X_test_tags_features = tfidf_tags.transform(X_test.tags).toarray()
X_test_description_features = tfidf_description.transform(X_test.description).toarray()

features = np.concatenate([X_train_title_features, X_train_tags_features, X_train_description_features], axis=1)
test_features = np.concatenate([X_test_title_features, X_test_tags_features, X_test_description_features], axis=1)

Wall time: 1.82 s


In [5]:
features.shape

(4697, 27392)

In [6]:
test_features.shape

(1566, 27392)

# Linear SVC

In [5]:
svm = LinearSVC(random_state=123)

In [9]:
%%time
svm.fit(features, X_train.category_id)

Wall time: 1.45 s


LinearSVC(random_state=123)

In [10]:
%%time
svm.score(test_features, X_test.category_id)

Wall time: 168 ms


0.8352490421455939

In [13]:
%%time
predictions = svm.predict(test_features)
f1_score(X_test.category_id, predictions, average='macro')

Wall time: 135 ms


0.7424935741879264

In [6]:
parameters = {
    'loss': ('hinge', 'squared_hinge'),
    'C':[1,5,10,15,20],
    'fit_intercept': (True, False),
    'tol':[1e-4, 1e-6, 1e-8],
}
clf = GridSearchCV(svm, parameters, n_jobs=-1, cv=4)

In [7]:
%%time
clf.fit(features, X_train.category_id)

Wall time: 4min 42s


GridSearchCV(cv=4, estimator=LinearSVC(random_state=123), n_jobs=-1,
             param_grid={'C': [1, 5, 10, 15, 20],
                         'fit_intercept': (True, False),
                         'loss': ('hinge', 'squared_hinge'),
                         'tol': [0.0001, 1e-06, 1e-08]})

In [19]:
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_fit_intercept',
 'param_loss',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [8]:
print(clf.best_score_)
print(clf.best_params_)

0.8273344448874551
{'C': 1, 'fit_intercept': False, 'loss': 'squared_hinge', 'tol': 0.0001}


In [9]:
%%time
predictions = clf.best_estimator_.predict(test_features)
f1_score(X_test.category_id, predictions, average='macro')

Wall time: 99.2 ms


0.8017550411667584

In [10]:
clf.__dict__

{'scoring': None,
 'estimator': LinearSVC(random_state=123),
 'n_jobs': -1,
 'iid': 'deprecated',
 'refit': True,
 'cv': 4,
 'verbose': 0,
 'pre_dispatch': '2*n_jobs',
 'error_score': nan,
 'return_train_score': False,
 'param_grid': {'loss': ('hinge', 'squared_hinge'),
  'C': [1, 5, 10, 15, 20],
  'fit_intercept': (True, False),
  'tol': [0.0001, 1e-06, 1e-08]},
 'multimetric_': False,
 'best_index_': 9,
 'best_score_': 0.8273344448874551,
 'best_params_': {'C': 1,
  'fit_intercept': False,
  'loss': 'squared_hinge',
  'tol': 0.0001},
 'best_estimator_': LinearSVC(C=1, fit_intercept=False, random_state=123),
 'refit_time_': 1.1345369815826416,
 'scorer_': <function sklearn.metrics._scorer._passthrough_scorer(estimator, *args, **kwargs)>,
 'cv_results_': {'mean_fit_time': array([ 9.16878194,  9.87211186,  9.05220842,  2.6953004 ,  3.04841167,
          3.10390186, 10.22409517, 11.52250379, 11.993545  ,  2.67486781,
          2.93687904,  3.48189014,  7.82627183,  9.18281406,  8.9809430

In [12]:
type(clf.best_estimator_)

sklearn.svm._classes.LinearSVC

In [13]:
with open('data/linearSVC.pkl', 'wb') as f:
    pickle.dump(clf.best_estimator_, f)

In [14]:
with open('data/tfidf_title.pkl', 'wb') as f:
    pickle.dump(tfidf_title, f)

In [15]:
with open('data/tfidf_description.pkl', 'wb') as f:
    pickle.dump(tfidf_description, f)
    
with open('data/tfidf_tags.pkl', 'wb') as f:
    pickle.dump(tfidf_tags, f)

## Count Vectorizer

In [3]:
%%time
vectorizer = CountVectorizer(stop_words="english")
features = vectorizer.fit_transform(df.title + ' ' + df.tags + ' ' + df.description)

Wall time: 521 ms


In [10]:
X_train, X_test, y_train, y_test = train_test_split(features, df.category_id, random_state=123)

In [13]:
svm = LinearSVC(random_state=123)
parameters = {
    'loss': ('hinge', 'squared_hinge'),
    'C':[1,5,10,15,20],
    'fit_intercept': (True, False),
    'tol':[1e-4, 1e-6, 1e-8],
}
clf = GridSearchCV(svm, parameters, n_jobs=-1, cv=4)

In [16]:
%%time
clf.fit(X_train, y_train)

Wall time: 2min 12s




GridSearchCV(cv=4, estimator=LinearSVC(random_state=123), n_jobs=-1,
             param_grid={'C': [1, 5, 10, 15, 20],
                         'fit_intercept': (True, False),
                         'loss': ('hinge', 'squared_hinge'),
                         'tol': [0.0001, 1e-06, 1e-08]})

In [17]:
print(clf.best_score_)
print(clf.best_params_)

0.7790054007031788
{'C': 1, 'fit_intercept': True, 'loss': 'squared_hinge', 'tol': 0.0001}


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6263 entries, 0 to 40846
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        6263 non-null   object
 1   tags         6263 non-null   object
 2   description  6263 non-null   object
 3   category_id  6263 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 244.6+ KB
