In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import progressbar
import pickle
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.ensemble import AdaBoostClassifier

from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import scikitplot as skplt

# Notebook display settings: remove the row limit for pandas output and the
# element limit for NumPy array printing, so nothing is elided with "...".
# NOTE(review): with these set, a cell ending in a bare `df` renders every row.
pd.set_option("display.max_rows", None)
np.set_printoptions(threshold=np.inf)

In [None]:
# load the original dataset (path is relative to the notebook's working directory)
df = pd.read_csv('data/USvideos.csv')

In [None]:
# list the column labels of the freshly loaded frame
df.columns

In [None]:
# dtypes and non-null counts per column, before any rows are dropped
df.info()

In [None]:
# drop rows with missing values
# (mutates df in place: the dropped rows can only be recovered by re-running the load cell)
df.dropna(inplace=True)
# non-mutating alternative:
# df = df.dropna()

In [None]:
# re-check non-null counts now that incomplete rows are gone
df.info()

In [None]:
# drop all unnecessary columns, selected by position rather than by name
# NOTE(review): the positions depend on the CSV's column order, and because the
# drop is in place this cell is not safely re-runnable. Later cells use title,
# channel_title, category_id, tags and description, so presumably those are the
# columns that remain -- confirm against df.columns.
df.drop(df.columns[[0,1,5,7,8,9,10,11,12,13,14]], axis=1, inplace=True)
# non-mutating alternative:
# df = df.drop(df.columns[[0,1,5,7,8,9,10,11,12,13,14]], axis=1)

In [None]:
# remove rows that repeat the same title / channel_title / tags / description
# combination (in place), then re-check the frame's size and dtypes
df.drop_duplicates(subset=["title","channel_title","tags","description"], inplace=True)
df.info()

# TEST

In [None]:
# Spot-check a raw description before cleaning.
# [52] is looked up as an index label, not a position, so it raises KeyError if
# that row was removed by the dropna / drop_duplicates cells.
# print(df.description[0])
# print(df.tags[1])
# print(df.tags[2])
# print(df.tags[3])
print(df.description[52])
# print(df.tags[40948])

In [None]:
# remove urls from string
# The pattern matches a scheme followed by "://", a host made of dot-separated
# labels, and then any number of "/segment" path parts (a segment stops at
# whitespace or the next "/").
# earlier attempt, kept for reference:
# re.sub(r'^https?:\/\/.*[\r\n]*', '', df.description[52], flags=re.MULTILINE)
line = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', df.description[52])
line

In [None]:
# split the URL-free text into tokens with NLTK and lowercase each one
tokens = [token.lower() for token in nltk.word_tokenize(line)]
tokens

In [None]:
# delete every ASCII punctuation character from each token
# (a translation table that maps a character to None removes that character)
table = str.maketrans(dict.fromkeys(string.punctuation))
stripped = [token.translate(table) for token in tokens]
stripped

In [None]:
# keep only tokens made up entirely of letters (drops numbers, mixed tokens and
# the empty strings left behind by punctuation stripping)
words = list(filter(str.isalpha, stripped))
words

In [None]:
# discard English stop words using NLTK's stop-word list
stop_words = nltk.corpus.stopwords.words('english')
words = [candidate for candidate in words if candidate not in stop_words]
words

In [None]:
# reduce each remaining word to its stem with the Porter stemmer
porter = nltk.stem.porter.PorterStemmer()
stemmed = list(map(porter.stem, words))
stemmed

In [None]:
%%time
# Experimental bulk cleaning of the short text columns (superseded by clean()).
table = str.maketrans('', '', string.punctuation)
stop_words = nltk.corpus.stopwords.words('english')
porter = nltk.stem.porter.PorterStemmer()


def _stem_text(text):
    '''
        Tokenize `text`, lowercase it, strip punctuation, keep alphabetic
        non-stop-words, stem them and return the stems joined with ";".
    '''
    tokens = [token.lower() for token in nltk.word_tokenize(text)]
    stripped = [token.translate(table) for token in tokens]
    words = [word for word in stripped if word.isalpha() and word not in stop_words]
    return ';'.join(porter.stem(word) for word in words)


# Whole-column assignment writes the results back row by row in order. The
# earlier version used a 0-based counter as an index label (df.title[index]),
# which no longer lines up with the index once dropna / drop_duplicates have
# removed rows, and relied on chained assignment.
df['title'] = [_stem_text(row) for row in progressbar.progressbar(df.title)]

# The earlier version read channel_title but wrote into df.description,
# overwriting the descriptions; the result belongs in channel_title.
df['channel_title'] = [_stem_text(row) for row in progressbar.progressbar(df.channel_title)]

# tags and description are intentionally left untouched in this experiment;
# apply _stem_text to them the same way if they need the same treatment.

## END TEST

In [None]:
def clean(df, column):
    '''
        Cleans the strings in specified column

        Each value has URLs removed, is tokenized and lowercased, stripped of
        punctuation, filtered down to alphabetic non-stop-words, stemmed, and
        finally reduced to stems longer than three characters joined by single
        spaces. The column is overwritten in ``df`` with the cleaned strings.

        Parameters:
            df:     DataFrame holding the column; modified in place.
            column: label of a column whose values are all strings.

        Returns:
            The cleaned column (``df[column]``) for convenience.
    '''
    print(column)
    table = str.maketrans('', '', string.punctuation)
    # a set gives constant-time membership tests for the stop-word filter
    stop_words = set(nltk.corpus.stopwords.words('english'))
    porter = nltk.stem.porter.PorterStemmer()
    url_pattern = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')

    cleaned = []
    for row in progressbar.progressbar(df[column]):
        line = url_pattern.sub('', row)
        tokens = [word.lower() for word in nltk.word_tokenize(line)]
        stripped = [w.translate(table) for w in tokens]
        words = [w for w in stripped if w.isalpha() and w not in stop_words]
        stemmed = [porter.stem(word) for word in words]
        stemmed = [word.strip() for word in stemmed if len(word) > 3]
        cleaned.append(' '.join(stemmed))

    # Results are collected in row order and written back with one column
    # assignment. Looking each row up again by its value could hit a different
    # row holding an equal string, and chained assignment (df[column][index] = ...)
    # is not guaranteed to update df.
    df[column] = cleaned
    return df[column]

In [None]:
%%time
# Clean the four text columns one after another. clean() rewrites the column
# inside df; whatever it returns is kept under a variable named after the column.
title, channel_title, tags, description = [
    clean(df, name) for name in ("title", "channel_title", "tags", "description")
]

In [None]:
# inspect the title column after cleaning
df.title

In [None]:
# inspect the channel_title column after cleaning
df.channel_title

In [None]:
# look at a single cleaned channel title (23 is an index label, not a position)
df.channel_title[23]

In [None]:
# inspect the tags column after cleaning
df.tags

In [None]:
# inspect the description column after cleaning
df.description

In [None]:
# drop all the empty strings: mark them as missing, then discard rows that
# contain a missing value
df.replace(to_replace='', value=np.nan, inplace=True)
df.dropna(inplace=True)

In [None]:
# show the frame after rows with empty strings were removed
df

In [None]:
# DON'T DO THIS, CATEGORIES ARE ALREADY ENCODED FOR US
# NOTE(review): despite the warning above this cell is live code. If it is run,
# every category_id value in df that matches an id from the JSON file is
# replaced, in place, by that category's title.
with open('data/US_category_id.json', 'r') as f:
    data = json.load(f)
    for item in data['items']:
        # the JSON id is converted to int before it is matched against the column;
        # one in-place replace call is issued per category entry
        df.replace({'category_id': int(item['id'])}, item['snippet']['title'], inplace=True)

In [None]:
# inspect the category_id column
df.category_id

In [None]:
# drop all the nones: treat the literal string 'none' (in any column) as
# missing and discard the affected rows
df.replace('none', float("NaN"), inplace=True)
df.dropna(inplace=True)
# keep exactly these five columns, in this order
df = df[['title', 'channel_title', 'tags', 'description', 'category_id']]
df.info()

In [None]:
# Drop channel_title, the column that sits at position 1 after the reordering
# in the previous cell. Dropping it by name (and tolerating its absence) keeps
# this cell safe to re-run; the positional in-place drop removed the next
# column (tags) when the cell was executed a second time.
df = df.drop(columns=['channel_title'], errors='ignore')
# df.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line
df.info()
df

## Save Dataframe

In [None]:
# Export the cleaned frame without the index. An earlier cell removes
# channel_title, and to_csv raises KeyError when `columns` names a column the
# frame no longer has, so only the columns still present are requested
# (in the original order).
export_columns = ['title', 'channel_title', 'category_id', 'tags', 'description']
df.to_csv('data/clean.csv', columns=[col for col in export_columns if col in df.columns], index=False)

In [None]:
%%time
# save df object: serialize the DataFrame to data/df.pkl in binary mode
with open('data/df.pkl', 'wb') as f:
    pickle.dump(df, f)

## Load Dataframe

In [2]:
%%time
# load df object back from data/df.pkl
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickles that this notebook (or another trusted source) produced
with open('data/df.pkl', 'rb') as f:
    df = pickle.load(f)

Wall time: 6.63 ms


In [None]:
# show the frame restored from data/df.pkl
df

## TF-IDF Vectorization

In [4]:
%%time
# One vectorizer per text column, so each column gets its own vocabulary.
tfidf_title = TfidfVectorizer()
tfidf_tags = TfidfVectorizer()
tfidf_description = TfidfVectorizer()

# Fit each vectorizer on its column and convert the result to a dense NumPy array.
# NOTE(review): the recorded output shows 6263 x 39441 for description alone, so
# these dense arrays are large -- confirm downstream code really needs dense input.
features_title = tfidf_title.fit_transform(df.title).toarray()
features_tags = tfidf_tags.fit_transform(df.tags).toarray()
features_description = tfidf_description.fit_transform(df.description).toarray()
# target column taken straight from the frame
labels = df.category_id
# report (rows, vocabulary size) for each feature matrix
print('Title Features Shape: ' + str(features_title.shape))
print('Tags Features Shape: ' + str(features_tags.shape))
print('Description Features Shape: ' + str(features_description.shape))

Title Features Shape: (6263, 6806)
Tags Features Shape: (6263, 16129)
Description Features Shape: (6263, 39441)
Wall time: 889 ms
