# New York Times Article Analysis

BitTiger DS501


## Natural Language Processing

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk.data
import numpy as np

In [None]:
# If you are running NLTK for the first time, download the NLTK stopwords first
# uncomment to run below:
# nltk.download("stopwords")

# Optionally, you can also download all data from NLTK; this step takes time.
# As long as you download the NLTK data once, it will reside on your machine.
# uncomment to run below:
# nltk.download("all")  # optional

In [None]:
# The four newsgroup categories this notebook works with.
categories = ['comp.graphics', 'rec.sport.baseball', 'sci.med', 'talk.politics.misc']
# Fetch the training subset once and reuse it; previously the same subset was
# fetched twice, once only to read `.data`.
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
data = newsgroups.data

In [None]:
# Inspect the container type and the number of training documents.
type(data), len(data)

In [None]:
# Peek at the first document in the training data.
data[0]

In [None]:
from string import punctuation
from nltk.corpus import stopwords

# NLTK's English stop-word list (requires the "stopwords" corpus to have been
# downloaded).
# NOTE(review): neither `stop_words` nor `punctuation` is referenced again in
# the visible cells; the vectorizers pass stop_words='english' instead.
stop_words = stopwords.words('english')

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Quick demo: stem one word and lemmatize another.
# NOTE(review): PorterStemmer is imported but not used in the visible cells.
stemmer = SnowballStemmer('english')
print(stemmer.stem('running'))

# NOTE(review): lemmatize() is called without a part-of-speech argument;
# presumably it treats the word as a noun, so 'caused' may come back
# unchanged -- confirm whether pos='v' was intended.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('caused'))

In [None]:
# Build a dense TF-IDF matrix over the training documents, dropping English
# stop words and capping the vocabulary at 2000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
vectors = vectorizer.fit_transform(data).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() keeps `words` a plain list of str as before.
words = vectorizer.get_feature_names_out().tolist()

In [None]:
# Expect (len(data), up to 2000): one row per document, one column per term.
vectors.shape

In [None]:
# Show the vocabulary terms that label the columns of `vectors`.
words

In [None]:
def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Rank the positions of lst from largest value to smallest, keep the
    first n positions, and return the entry of labels found at each of
    those positions (in that ranked order).

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    # argsort gives ascending order; reversing it puts the largest first.
    ranked_positions = np.argsort(lst)[::-1]
    top_labels = []
    for position in ranked_positions[:n]:
        top_labels.append(labels[position])
    return top_labels

In [None]:
# Mean tf-idf per term, averaged only over the documents in which the term
# has a positive weight (column sum divided by the count of positive entries).
avg = vectors.sum(axis=0) / (vectors > 0).sum(axis=0)
print("top 10 by average tf-idf")
print(get_top_values(avg, 10, words))

In [None]:
# Total tf-idf per term: the column sums over all documents.
total = vectors.sum(axis=0)
print("top 10 by total tf-idf")
print(get_top_values(total, 10, words))

In [None]:
# Redo the vectorization without IDF weighting, so scores reflect term
# frequency only.
vectorizer2 = TfidfVectorizer(use_idf=False, max_features=2000)
# Join all documents into one giant document so there is a single row of
# corpus-wide term frequencies.
vectors2 = vectorizer2.fit_transform(["\n".join(data)]).toarray()
# Labels must come from vectorizer2's own vocabulary. The earlier `words`
# list belongs to a different vectorizer (fitted with stop-word removal), so
# its entries do not line up with the columns of vectors2.
words2 = vectorizer2.get_feature_names_out().tolist()
print("top 10 by tf across all corpus")
print(get_top_values(vectors2[0], 10, words2))

In [None]:
# For every newsgroup category, fit a separate TF-IDF vectorizer on that
# category's documents and print its top terms by average and by total tf-idf.
all_newsgroups = fetch_20newsgroups()
all_data = np.array(all_newsgroups.data)

for i, category in enumerate(all_newsgroups.target_names):
    # Loop-local names (cat_*) keep this cell from overwriting the
    # notebook-wide `data`, `vectorizer`, `vectors`, `words`, `avg` and
    # `total` that other cells rely on.
    cat_docs = all_data[all_newsgroups.target == i]
    cat_vectorizer = TfidfVectorizer(stop_words='english')
    cat_vectors = cat_vectorizer.fit_transform(cat_docs).toarray()
    # get_feature_names() was removed in scikit-learn 1.2.
    cat_words = cat_vectorizer.get_feature_names_out()
    print("Category: %s" % category)
    # Average only over documents where the term has a positive weight.
    cat_avg = np.sum(cat_vectors, axis=0) / np.sum(cat_vectors > 0, axis=0)
    print("  Top 10 by average tf-idf")
    print("    %s" % ", ".join(get_top_values(cat_avg, 10, cat_words)))
    cat_total = np.sum(cat_vectors, axis=0)
    print("  Top 10 by total tf-idf")
    print("    %s" % ", ".join(get_top_values(cat_total, 10, cat_words)))
    print("-----------------------------")

## Document Classification

### Load Train Data

In [None]:
# Fetch the training subset once and read both the documents and their
# labels from it, instead of fetching the same subset twice.
train_newsgroups = fetch_20newsgroups(subset='train', categories=categories)
data = train_newsgroups.data
target = train_newsgroups.target

In [None]:
# Number of training documents and the container type holding them.
len(data), type(data)

In [None]:
# Shape of the label array; expected to have one label per document -- confirm against len(data).
target.shape

### Vectorize documents

In [None]:
# Build the dense TF-IDF feature matrix for classification: English stop
# words removed, vocabulary capped at 2000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
vectors = vectorizer.fit_transform(data).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() keeps `words` a plain list of str as before.
words = vectorizer.get_feature_names_out().tolist()

In [None]:
# Expect (len(data), up to 2000): one row per document, one column per term.
vectors.shape

In [None]:
# Conventional aliases: X is the training feature matrix, y the training labels.
X, y = vectors, target

### Load and Transform Test Data

In [None]:
# Fetch the test subset once and read both the documents and their labels
# from it, instead of fetching the same subset twice.
test_newsgroups = fetch_20newsgroups(subset='test', categories=categories)
test_data = test_newsgroups.data
test_target = test_newsgroups.target

In [None]:
# Transform (not re-fit) the test documents with the vectorizer fitted on the
# training data, so the test columns match those of X.
# NOTE(review): X_test is left as returned by transform(), whereas X was
# converted with .toarray() -- confirm every estimator used accepts both.
X_test = vectorizer.transform(test_data)
y_test = test_target

### Build classifiers with sklearn 

#### Let's first try Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyperparameters.
model = LogisticRegression()

# Fit on the TF-IDF training matrix X and its labels y.
model.fit(X, y)

In [None]:
# Training-set score right after fitting (the same call is repeated further
# down, next to the test-set score).
model.score(X, y)

In [None]:
# coef_ is later indexed by category number and paired with `words`, so
# expect one row per class and one column per term -- confirm.
model.coef_.shape

In [None]:
# Get the top words driving the prediction of one category: the terms with
# the largest coefficients in that category's row of model.coef_.
# NOTE(review): assumes row i of model.coef_ and categories[i] refer to the
# same class -- confirm the ordering against the dataset's target names.
num_category = 0

print(categories[num_category])

get_top_values(model.coef_[num_category], 10, words)

In [None]:
# Get score for the training set (the same data the model was fit on)
model.score(X, y)

In [None]:
# Get score for the test set (documents not used for fitting)
model.score(X_test, y_test)

#### Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default hyperparameters; rebinds `model`,
# replacing the previously fitted classifier.
model = MultinomialNB()

# Fit on the same TF-IDF training matrix X and labels y.
model.fit(X, y)

In [None]:
# Get score for the training set (the same data the model was fit on)
model.score(X, y)

In [None]:
# Get score for the test set (documents not used for fitting)
model.score(X_test, y_test)

In [None]:
# Shape of the training feature matrix (X is an alias of `vectors`).
X.shape

#### Random Forest Classifier

In [None]:
# use one vs rest classifier for multi-class classification
from sklearn.multiclass import OneVsRestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest base estimator; random_state is fixed so runs are repeatable.
estimator = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    min_samples_leaf=3,
    random_state=1,
)
# Wrap the forest in a one-vs-rest scheme; n_jobs=-1 asks for all available
# cores.
model = OneVsRestClassifier(estimator, n_jobs=-1)

model.fit(X, y)

In [None]:
# Get score for the training set (the same data the model was fit on)
model.score(X, y)

In [None]:
# Get score for the test set (documents not used for fitting)
model.score(X_test, y_test)

#### Gradient Boosted Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees base estimator; random_state is fixed so runs are
# repeatable.
estimator = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    min_samples_leaf=2,
    random_state=1,
)
# Wrap the booster in a one-vs-rest scheme; n_jobs=-1 asks for all available
# cores.
model = OneVsRestClassifier(estimator, n_jobs=-1)

model.fit(X, y)

In [None]:
# Get score for the training set (the same data the model was fit on)
model.score(X, y)

In [None]:
# Get score for the test set (documents not used for fitting)
model.score(X_test, y_test)