In [8]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline 
from sklearn.metrics import classification_report

In [7]:
df = pd.read_csv('final_data.csv')
df.drop(df.columns[0],axis=1,inplace=True)
df.head()

Unnamed: 0,title,view_count,like_count,comment_count,publish_time,duration,language,description,tags,channel,date,time,year,tag_num,title_len,desc_len,short_vid
0,The complete FUN TO IMAGINE with Richard Feynman,4165520,87497,7292,2018-11-01T13:22:07Z,4010.0,en-US,You can find an HD upload at https://youtu.be/...,"['feynman', 'science', 'physics', 'history', '...",Christopher Sykes,2018-11-01,13:22:07,2018,36,8,98,0
1,"There's no such thing as MIRACLE, Richard Feyn...",3894360,145637,2841,2020-08-26T20:21:49Z,320.0,,"In this video, Richard Feynman talks about why...","['study with me', 'study', 'music for studying...",BTY 365,2020-08-26,20:21:49,2020,18,14,79,0
2,Richard Feynman talks about Algebra,1383026,28242,1592,2014-01-22T19:33:22Z,82.0,,From the Pleasure of Finding Things Out. I lov...,"['Richard Feynman (Author)', 'Algebra (Mathema...",David Petro,2014-01-22,19:33:22,2014,3,5,33,0
3,Inside the Mind of Richard Feynman: The Great ...,1295632,40328,2102,2013-03-04T20:01:32Z,624.0,en,"In today's SciShow episode of Great Minds, we'...","['richard feynman', 'quantum electrodynamics',...",SciShow,2013-03-04,20:01:32,2013,25,9,113,0
4,The best teacher I never had,4618209,170957,2872,2016-01-27T16:01:46Z,156.0,en,A video tribute from Bill Gates to Richard Fey...,[],Bill Gates,2016-01-27,16:01:46,2016,2,6,21,0


### Model Building
#### Our final goal is to use Multiple Linear Regression to predict the view counts for a RPF-related video. But in order to do that, first we need to transform our text data into numerical values before feeding them into the model. We will use sklearn text frequency - inverse document frequency model for this task. There are 2 main steps that we need to go through in this processing stage. First, we'll go through them one by one and then, we'll build a pipeline to automate.

* Text Pre-processing: using the simple bag-of-words approach, where each unique word in a text is represented by one number
* Vectorization: we'll convert the list of tokens above into a vector that machine learning models can understand.

 

#### Step 1: Text Pre-processing
The text features for this section include
* Titles
* Description
* Tags

We'll create a function that takes 2 features - title, description - and outputs their respective bag-of-words because these need more preprocessing than tags? Actually no, still need to remove punctuation and stopwords.

TODO: stopwords of other languages?

In [30]:
def tokenize(text):
    '''
    1. remove punctuation
    2. remove stopwords, numbers
    '''
    if type(text) == str:
        no_punc = [char for char in text if char not in string.punctuation]
        no_punc = ''.join(no_punc)
    return [word for word in no_punc.split() if (word.lower() not in stopwords.words('english')) and (not word.isdigit())]

In [31]:
# try applying this
df['tags'].head(5).apply(tokenize)

0    [feynman, science, physics, history, philosoph...
1    [study, study, music, studying, focus, music, ...
2    [Richard, Feynman, Author, Algebra, Mathematic...
3    [richard, feynman, quantum, electrodynamics, c...
4                                                   []
Name: tags, dtype: object

#### Step 2: Vectorization ... explain clearly what needs to be done

In [95]:
# don't need to create func here, later with pipeline

In [32]:
#try this on tags first
bow_transformer = CountVectorizer(analyzer=tokenize).fit(df['tags'])

In [33]:
len(bow_transformer.vocabulary_)

2716

In [34]:
bow_transformer.get_feature_names_out()[:30]

array(['10Cosas', '3K', '3blue1brown', '4K', '4k', '9LX', 'ACME', 'AI',
       'APhDDoesntMakeYouIntelligent', 'Academic', 'Academy',
       'Accommodation', 'Admission', 'Advice', 'Affirmation', 'Age',
       'Alamos', 'Albert', 'Albuquerque', 'Algebra', 'Alt', 'Altyazı',
       'Altyazılı', 'Amazing', 'America', 'American', 'Another',
       'AntiOtário', 'Approach', 'Aprenda'], dtype=object)

In [35]:
df_bow = bow_transformer.transform(df['tags'])

In [48]:
#test 1 item
bow_3 = bow_transformer.transform([df['tags'][3]])
bow_transformer.get_feature_names_out()[1641]
'explainer' in df['tags'][3].lower()

In [77]:
bow_transformer.get_feature_names_out()[1641]

'jewish'

In [94]:
df_bow.shape

(511, 2716)

In [65]:
print(bow_3)

  (0, 233)	1
  (0, 254)	2
  (0, 296)	1
  (0, 298)	2
  (0, 302)	2
  (0, 343)	1
  (0, 477)	1
  (0, 627)	1
  (0, 657)	2
  (0, 825)	1
  (0, 1131)	1
  (0, 1133)	1
  (0, 1233)	1
  (0, 1305)	2
  (0, 1399)	3
  (0, 1505)	1
  (0, 1640)	1
  (0, 1641)	3
  (0, 1725)	1
  (0, 1777)	1
  (0, 1793)	1
  (0, 1796)	1
  (0, 1799)	1
  (0, 1835)	1
  (0, 1913)	1
  (0, 1941)	2
  (0, 2006)	1
  (0, 2020)	1
  (0, 2042)	1
  (0, 2051)	1
  (0, 2094)	2
  (0, 2107)	1
  (0, 2138)	4
  (0, 2210)	3
  (0, 2250)	1
  (0, 2266)	1


In [41]:
# df_bow.toarray()
df_bow.nnz

6497

In [40]:
# calculate sparsity
sparsity = (100*df_bow.nnz/(df_bow.shape[0]*df_bow.shape[1]))
sparsity

0.46812539448769197

Weighing and Normalizing

In [42]:
tfidf_transformer = TfidfTransformer().fit(df_bow)

In [44]:
df_tfidf = tfidf_transformer.transform(df_bow)

In [46]:
df_tfidf.shape

(511, 2716)

In [75]:
print(df_tfidf)

  (0, 2587)	0.17031115887164838
  (0, 2535)	0.1383383351514275
  (0, 2399)	0.16381703032050798
  (0, 2253)	0.08002395274413629
  (0, 2215)	0.19048376466243103
  (0, 2163)	0.19048376466243103
  (0, 2128)	0.17868354673299272
  (0, 2051)	0.0743926877109857
  (0, 2044)	0.12653811722198924
  (0, 1982)	0.19048376466243103
  (0, 1941)	0.11697769018780234
  (0, 1839)	0.17868354673299272
  (0, 1827)	0.19048376466243103
  (0, 1792)	0.12205188257551142
  (0, 1765)	0.19048376466243103
  (0, 1764)	0.19048376466243103
  (0, 1726)	0.1540247062957323
  (0, 1707)	0.19048376466243103
  (0, 1694)	0.11697769018780234
  (0, 1577)	0.15013855308086577
  (0, 1548)	0.17868354673299272
  (0, 1543)	0.15013855308086577
  (0, 1532)	0.16381703032050798
  (0, 1456)	0.14364442452972534
  (0, 1419)	0.15013855308086577
  :	:
  (508, 2572)	0.09544051298999208
  (508, 2461)	0.09544051298999208
  (508, 2309)	0.17066634943335982
  (508, 2308)	0.09544051298999208
  (508, 2020)	0.08533317471667991
  (508, 1584)	0.47720256494

In [78]:
# We can grab the col of each unique word via bow_transformer.vocabulary_
# Then we can get the idf of each word with the fitted tfidf.idf_ 
# get_feature_name_out() = inverse of vocabulary_

bow_transformer.vocabulary_['jewish']

1641

In [132]:
tfidf_transformer.idf_[bow_transformer.vocabulary_['caltech']]

5.041100047703289

In [82]:
tfidf_transformer.idf_[bow_transformer.vocabulary_['electrodynamics']]

4.465735902799727

In [93]:
len(tfidf_transformer.idf_)


2716

We're done preprocessing the text for input. Now let's pass that to the MLR model

In [96]:
# try on the processed tags input first

view_predict_model = LinearRegression().fit(df_tfidf,df['view_count'])

In [99]:
view_predict_model.predict(df_tfidf[1])

array([3894353.48546109])

In [122]:
predicted_view = view_predict_model.predict(df_tfidf)

In [128]:
df.iloc[1][1] # pretty close

3894360

In [127]:
# R2 score
r2_score(df['view_count'],predicted_view) # pretty high, but train and test on the same data

0.8607831528143813

#### Now create the pipeline

In [102]:
# try on the processed tags input first

tag_train, tag_test, label_train, label_test = train_test_split(df['tags'],df['view_count'],test_size=0.2,random_state=1)

In [138]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=tokenize)),
    ('tfidf',TfidfTransformer()),
    ('regressor',LinearRegression())
])

In [129]:
tag_train

42     ['Richard', 'Feynman', 'Messenger', 'Lectures'...
58     ['Richard', 'Feynman', 'understanding', 'physi...
455               ['Ciencias', 'TV', 'Facultad', 'UNAM']
78                                                    []
484                                                   []
                             ...                        
255    ['RichardFeynman', 'PhysicsGenius', 'NobelPriz...
72     ['ciencia', 'richard feynman', 'ciencia difici...
396    ['Richard Feynman', 'Interpretation', 'quantum...
235                                                   []
37     ['richard feynman', 'enrico fermi', 'robert op...
Name: tags, Length: 408, dtype: object

In [139]:
pipeline.fit(tag_train,label_train)

In [140]:
pipeline.score(tag_test,label_test) #this calculate R2 value

-0.3440153761832492

In [105]:
predictions = pipeline.predict(tag_test)

In [111]:
# calculate the MSE ourselves

np.mean((predictions-label_test)**2) 

1293313007511.9368

In [118]:
label_test - predictions

47      70133.369267
345    -37600.630733
284   -493349.712538
221    524972.338025
502    -37135.630733
           ...      
92     182237.541372
225    -42119.630733
411   -628264.006722
329         1.142551
446    -45130.630733
Name: view_count, Length: 103, dtype: float64

In [115]:
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score


In [116]:
mae = mean_squared_error(label_test,predictions)
mse = mean_squared_error(label_test,predictions)
r2 = r2_score(label_test,predictions)
print('Mean Absolute Error:',mae)
print('Mean Squared Error:',mse)
print('R2:',r2)

Mean Absolute Error: 1293313007511.9368
Mean Squared Error: 1293313007511.9368
R2: -0.3440153761832492


Negative R-squared means the model is worse in predicting variation in the data than its mean value. What did we do wrong?
Several reasons for this:
* No relation between tag content and view_counts (could be, since normally we only look at titles to determine whether to watch something or not)
* Something wrong with our tokenization method (guess = compound phrases without space were skipped)
* Better pass this in as a regressor in a multiple linear regression model

In [130]:
# Try to tokenize tags more thoroughly

In [134]:
# Try with titles

title_train, title_test, view_train, view_test = train_test_split(df['title'],df['view_count'],test_size=0.2,random_state=1)
pipeline.fit(title_train,view_train).score(title_test,view_test)



-0.5779314264900488

In [136]:
predictions = pipeline.predict(title_test)
np.mean((predictions-view_test)**2)

1518404681229.7773

In [137]:
mae = mean_squared_error(view_test,predictions)
mse = mean_squared_error(view_test,predictions)
r2 = r2_score(view_test,predictions)
print('Mean Absolute Error:',mae)
print('Mean Squared Error:',mse)
print('R2:',r2)

Mean Absolute Error: 1518404681229.7773
Mean Squared Error: 1518404681229.7773
R2: -0.5779314264900488


In [141]:
pipeline.named_steps['regressor'].coef_

array([ -57566.87984615,  559312.23004949,  550980.23866265, ...,
       -213894.98038508, -213894.98038508,  -13263.37352132])

Since there is no apparent relationship between the text input and view_counts, let's only use the other originally numeric variables in our MLR model.

In [146]:
# We create to_current as years from publishing year to current year
from datetime import datetime
df['to_current'] = datetime.now().year - df['year']


In [152]:
sub_df = df[['like_count','comment_count','duration','to_current','tag_num','title_len','desc_len','short_vid']]

In [153]:
sub_df.head()

Unnamed: 0,like_count,comment_count,duration,to_current,tag_num,title_len,desc_len,short_vid
0,87497,7292,4010.0,5,36,8,98,0
1,145637,2841,320.0,3,18,14,79,0
2,28242,1592,82.0,9,3,5,33,0
3,40328,2102,624.0,10,25,9,113,0
4,170957,2872,156.0,7,2,6,21,0


In [154]:
x_train, x_test, y_train, y_test = train_test_split(sub_df,df['view_count'],test_size=0.1,random_state=1)
x_train.shape, x_test.shape

((459, 8), (52, 8))

In [160]:
rige = Ridge().fit(x_train,y_train).score(x_test,y_test)
rige

0.6245182945891695

In [159]:
LinearRegression().fit(x_train,y_train).score(x_test,y_test)

0.6245130375083291

In [161]:
# sub_df2 = df[['like_count','duration','to_current','tag_num','title_len','desc_len','short_vid']]

TODO: standard scaling input data

examine how to incorporate the text input matrix as one regressor among other numeric variables.
