In [1]:
df = pd.read_csv('../data/fakenews_train.csv')

In [2]:
df[['title', 'text', 'label']]

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Rapper T. I. unloaded on black celebrities who...,0
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",When the Green Bay Packers lost to the Washing...,0
20797,Macy’s Is Said to Receive Takeover Approach by...,The Macy’s of today grew from the union of sev...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal...",1


In [3]:
df = df[['id','text','label']]

In [4]:
df = df.fillna('0')

#### CountVectorizer for text classification

In [5]:
# Import the necessary modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Print the head of df
print(df.head())

# Create a series to store the labels: y
y = df.label

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size = 0.33, random_state = 53)

# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words= 'english')

# Transform the training data using only the 'text' column values: count_train 
count_train = count_vectorizer.fit_transform(X_train)

# Transform the test data using only the 'text' column values: count_test 
count_test = count_vectorizer.transform(X_test)

# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])

   id                                               text  label
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...      1
1   1  Ever get the feeling your life circles the rou...      0
2   2  Why the Truth Might Get You Fired October 29, ...      1
3   3  Videos 15 Civilians Killed In Single US Airstr...      1
4   4  Print \nAn Iranian woman has been sentenced to...      1
['00', '000', '0000', '0001', '0002', '0002062', '000billion', '000c', '000ft', '000km']


#### TfidfVectorizer for text classification


In [8]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df =0.7)

# Transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data: tfidf_test 
tfidf_test = tfidf_vectorizer.transform(X_test)

# Print the first 10 features
print(tfidf_vectorizer.get_feature_names()[:10])

# Print the first 5 vectors of the tfidf training data
print(tfidf_train.A[:5])


['00', '000', '0000', '0001', '0002', '0002062', '000billion', '000c', '000ft', '000km']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


#### Inspecting the vectors

To get a better idea of how the vectors work, you'll investigate them by converting them into pandas DataFrames.

In [11]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame( count_train.A, 
            columns=count_vectorizer.get_feature_names())

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, 
            columns = tfidf_vectorizer.get_feature_names())

# Print the head of count_df
print(count_df.head())

   00  000  0000  0001  0002  0002062  000billion  000c  000ft  000km  ...  \
0   0    0     0     0     0        0           0     0      0      0  ...   
1   0    0     0     0     0        0           0     0      0      0  ...   
2   0    0     0     0     0        0           0     0      0      0  ...   
3   0    0     0     0     0        0           0     0      0      0  ...   
4   0    0     0     0     0        0           0     0      0      0  ...   

   鉴于当代挑战和威胁的全球性质  集体安全条约组织  集体安全条约组织与联合国专门机构  集体安全条约组织是多层面结构  \
0               0         0                 0               0   
1               0         0                 0               0   
2               0         0                 0               0   
3               0         0                 0               0   
4               0         0                 0               0   

   集体安全条约组织正在积极促进帮助阿富汗进行冲突后重建以及消除来自该国的毒品威胁的国际努力  集体安全条约组织秘书长博尔久扎  集体提出创新办法  \
0                                             0               

In [12]:
# Print the head of tfidf_df
print(tfidf_df.head())

    00  000  0000  0001  0002  0002062  000billion  000c  000ft  000km  ...  \
0  0.0  0.0   0.0   0.0   0.0      0.0         0.0   0.0    0.0    0.0  ...   
1  0.0  0.0   0.0   0.0   0.0      0.0         0.0   0.0    0.0    0.0  ...   
2  0.0  0.0   0.0   0.0   0.0      0.0         0.0   0.0    0.0    0.0  ...   
3  0.0  0.0   0.0   0.0   0.0      0.0         0.0   0.0    0.0    0.0  ...   
4  0.0  0.0   0.0   0.0   0.0      0.0         0.0   0.0    0.0    0.0  ...   

   鉴于当代挑战和威胁的全球性质  集体安全条约组织  集体安全条约组织与联合国专门机构  集体安全条约组织是多层面结构  \
0             0.0       0.0               0.0             0.0   
1             0.0       0.0               0.0             0.0   
2             0.0       0.0               0.0             0.0   
3             0.0       0.0               0.0             0.0   
4             0.0       0.0               0.0             0.0   

   集体安全条约组织正在积极促进帮助阿富汗进行冲突后重建以及消除来自该国的毒品威胁的国际努力  集体安全条约组织秘书长博尔久扎  集体提出创新办法  \
0                                           0.0         

In [13]:
# Calculate the difference in columns: difference
difference = set(tfidf_df.columns) - set(count_df.columns)
print(difference)

# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))

set()
False


#### Training and testing the "fake news" model with CountVectorizer


In [18]:
y_train.value_counts

<bound method IndexOpsMixin.value_counts of 1748     1
1795     0
7870     0
11693    1
11048    1
        ..
4596     1
8854     0
19645    1
14075    0
2933     1
Name: label, Length: 13936, dtype: int64>

Naive Bayes classifier 
- Naive Bayes Model
	- commonly used for testing NLP classification problems
	- basis in probability



In [20]:
# Import the necessary modules
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print('')
print(cm)


0.8978729603729604

[[3283  127]
 [ 574 2880]]


#### Training and testing the "fake news" model with TfidfVectorizer


In [21]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(tfidf_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)


0.8534382284382285
[[3376   34]
 [ 972 2482]]


#### Improving your model

Your job in this exercise is to test a few different alpha levels using the Tfidf vectors to determine if there is a better performing combination.

In [23]:
# Create the list of alphas: alphas
alphas = np.arange(0,1, 0.1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()


Alpha:  0.0
Score:  0.8774766899766899

Alpha:  0.1
Score:  0.9013694638694638

Alpha:  0.2
Score:  0.8945221445221445

Alpha:  0.30000000000000004
Score:  0.8876748251748252

Alpha:  0.4
Score:  0.8806818181818182

Alpha:  0.5
Score:  0.8754370629370629

Alpha:  0.6000000000000001
Score:  0.8703379953379954

Alpha:  0.7000000000000001
Score:  0.8656759906759907

Alpha:  0.8
Score:  0.8611596736596736

Alpha:  0.9
Score:  0.8581002331002331



### Inspecting your model

Now that you have built a "fake news" classifier, you'll investigate what it has learned. You can map the important vector weights back to actual words using some simple inspection techniques.

In [25]:
# Get the class labels: class_labels
class_labels = nb_classifier.classes_

# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

print('')
print('')
# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])


0 [(-12.272119266891478, '0000'), (-12.272119266891478, '000c'), (-12.272119266891478, '000s'), (-12.272119266891478, '003'), (-12.272119266891478, '014'), (-12.272119266891478, '0200gmt'), (-12.272119266891478, '021'), (-12.272119266891478, '022'), (-12.272119266891478, '036'), (-12.272119266891478, '046'), (-12.272119266891478, '055'), (-12.272119266891478, '060'), (-12.272119266891478, '061'), (-12.272119266891478, '064'), (-12.272119266891478, '068'), (-12.272119266891478, '072'), (-12.272119266891478, '074'), (-12.272119266891478, '077'), (-12.272119266891478, '0800'), (-12.272119266891478, '085')]


1 [(-7.968809440416186, 'government'), (-7.955071684537176, 'time'), (-7.928203647965548, 'state'), (-7.91901515859573, 'media'), (-7.905447278651641, 'president'), (-7.884950835742675, 'like'), (-7.884059282507659, 'war'), (-7.853272523785743, 'obama'), (-7.832332443544034, 'new'), (-7.795349623141636, 'world'), (-7.780378901871515, 'just'), (-7.762635293021732, 'said'), (-7.73927071