In [1]:
%load_ext autotime

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import unicodedata
import nltk
import codecs
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

time: 1.72 s


In [4]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

time: 79.8 ms


In [5]:
df = pd.read_pickle('./datasets/news_data.pickle')

time: 362 ms


In [6]:
df.head()

Unnamed: 0,URL,CATEGORY,content and summary
50890,http://blogs.wsj.com/moneybeat/2014/03/26/fed-...,business,[The Federal Reserve approved Ally Financial I...
50891,http://cumberlink.com/news/national/duke-share...,business,[]
50892,http://www.bizjournals.com/charlotte/blog/ener...,business,[]
50893,http://www.ky3.com/news/local/duke-energy-shar...,business,[]
50894,http://www.chem.info/news/2014/03/regulators-m...,business,[]


time: 9.97 ms


In [7]:
df = df.dropna()
df.shape

(108043, 3)

time: 35.9 ms


In [8]:
df = df.loc[:,['CATEGORY','content and summary']]

time: 9.97 ms


In [9]:
df = df[df['content and summary'].map(lambda d: len(d)) > 0]

time: 67.8 ms


In [10]:
df.shape

(37033, 2)

time: 6.04 ms


In [11]:
def get_content(row):
    row = row[0]
    return row
def get_summary(row):
    row = row[1]
    return row

time: 2 ms


In [12]:

df['content'] = df['content and summary'].apply(get_content)
df['summary'] = df['content and summary'].apply(get_summary)

time: 49.9 ms


In [13]:
df.head()

Unnamed: 0,CATEGORY,content and summary,content,summary
50890,business,[The Federal Reserve approved Ally Financial I...,The Federal Reserve approved Ally Financial In...,The Federal Reserve approved Ally Financial In...
50898,business,[— Major shareholders of Duke Energy Corp. hav...,— Major shareholders of Duke Energy Corp. have...,— Major shareholders of Duke Energy Corp. have...
50900,business,[Photos taken earlier this month show that Nor...,Photos taken earlier this month show that Nort...,Photos taken earlier this month show that Nort...
50903,business,[Thanks to dogged reporting by the Associated ...,Thanks to dogged reporting by the Associated P...,Thanks to dogged reporting by the Associated P...
50906,business,[The energy giant says it is committed to clea...,The energy giant says it is committed to clean...,The energy giant says it is committed to clean...


time: 11 ms


In [14]:
df.drop(['content and summary','summary'],axis=1, inplace=True)


time: 21.9 ms


In [15]:
df.head()

Unnamed: 0,CATEGORY,content
50890,business,The Federal Reserve approved Ally Financial In...
50898,business,— Major shareholders of Duke Energy Corp. have...
50900,business,Photos taken earlier this month show that Nort...
50903,business,Thanks to dogged reporting by the Associated P...
50906,business,The energy giant says it is committed to clean...


time: 9.97 ms


In [16]:
df = df.rename(columns= {'CATEGORY':'category'})

time: 3.99 ms


In [17]:
df.head()

Unnamed: 0,category,content
50890,business,The Federal Reserve approved Ally Financial In...
50898,business,— Major shareholders of Duke Energy Corp. have...
50900,business,Photos taken earlier this month show that Nort...
50903,business,Thanks to dogged reporting by the Associated P...
50906,business,The energy giant says it is committed to clean...


time: 12 ms


#### Before cleaning the text

In [18]:
for i in range(1):
    print("Content")
    print("----------------------")
    print(df.content.iloc[i])
#     print("Summary")
#     print("----------------------")
#     print(df.summary.iloc[i])

Content
----------------------
The Federal Reserve approved Ally Financial Inc.’s capital plan in the bank regulator’s annual review of the industry’s financial health, clearing another potential hurdle to the auto lender’s plans to exit government ownership.

Ally’s plan was approved after the Federal Reserve found that the bank could keep lending in a severe economic downturn, according to a report Wednesday.
time: 1.99 ms


In [19]:
def clean_text(text):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Add contractions
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    
    text =  ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )
    
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)

    return text

time: 6.98 ms


In [20]:
df['content'] = df['content'].apply(clean_text)


time: 26.1 s


#### After cleaning the text

In [21]:
for i in range(1):
    print("Content ")
    print("----------------------")
    print(df.content.iloc[i])
    #print("Summary")
    #print("----------------------")
    #print(df.summary.iloc[i])
    

Content 
----------------------
federal reserve approved ally financial inc ’s capital plan bank regulator’s annual review industry’s financial health clearing another potential hurdle auto lender’s plans exit government ownership ally’s plan approved federal reserve found bank could keep lending severe economic downturn according report wednesday
time: 2 ms


In [22]:
df.head()

Unnamed: 0,category,content
50890,business,federal reserve approved ally financial inc ’s...
50898,business,— major shareholders duke energy corp called c...
50900,business,photos taken earlier month show north carolina...
50903,business,thanks dogged reporting associated press know ...
50906,business,energy giant says committed cleaning dan river...


time: 29.9 ms


#### Encoding the categories

In [23]:
label_encoder = LabelEncoder() 

df['category']= label_encoder.fit_transform(df['category'])

time: 17 ms


In [24]:
df.sample(5)

Unnamed: 0,category,content
82862,1,kim kardashian mario dedivanovic league ⁣ ⁣ du...
74733,1,gwyneth paltrow chris martin staying civil pos...
80390,0,happens towards end year banks trying build ce...
77733,3,four episodes ‘cosmos’ continues amaze quality...
23823,2,photo karandaev getty images one tequila two t...


time: 22.9 ms


In [25]:
X_train, X_test, y_train, y_test = train_test_split(df['content'],df['category'],test_size=0.3,random_state=0)

time: 28.9 ms


#### Bag of words

In [26]:
vectorizer = CountVectorizer(min_df=2, ngram_range=(2,2))
bow_vector_train = vectorizer.fit_transform(X_train)
bow_vector_test = vectorizer.transform(X_test)

time: 22.3 s


In [27]:
clf = RandomForestClassifier(n_estimators=10)

time: 1.03 ms


In [28]:
clf.fit(bow_vector_train,y_train )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

time: 3min 52s


In [29]:
pred = clf.predict(bow_vector_test)

time: 265 ms


In [30]:
accuracy_score(pred, y_test)

0.8900990099009901

time: 3 ms


#### Tf-idf

In [31]:
tfidvectorizer = TfidfVectorizer(min_df=6, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True)
tfid_train_vector = tfidvectorizer.fit_transform(X_train)

tfid_test_vector = tfidvectorizer.transform(X_test)

time: 6.8 s


In [32]:
clf = RandomForestClassifier(n_estimators=10)

Error in callback <function LineWatcher.stop at 0x00000160C876E0D0> (for post_run_cell):


AssertionError: 

In [33]:
clf.fit(tfid_train_vector,y_train )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

time: 22.8 s


In [34]:
pred = clf.predict(tfid_test_vector)

time: 117 ms


In [35]:
accuracy_score(pred, y_test)

0.8821782178217822

time: 2.99 ms
