# **1. Data preparation**

In [None]:
# import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
import seaborn as sns
import pickle

In [None]:
# load the news dataset from a CSV file located in the working directory
data = pd.read_csv('news_dataset.csv')

In [None]:
# preview the first rows of the freshly loaded frame
data.head()

Unnamed: 0,category,text
0,HEALTH,"When you feel like this, it’s important to kno..."
1,HEALTH,I can vividly remember the first time I felt f...
2,HEALTH,Because it's only becoming more of a struggle....
3,HEALTH,"""The only Whole30 I want to participate in is ..."
4,HEALTH,"Essentially, your hands are kept warm thanks t..."


In [None]:
# encode each distinct category label as an integer id (ids follow order of first appearance)
data['category_id'] = pd.factorize(data['category'])[0]
# positional rename of the three columns: category -> type, text -> news
colslist = ['type', 'news', 'category_id']
data.columns = colslist

In [None]:
# preview again to check the renamed columns and the new category_id column
data.head()

Unnamed: 0,type,news,category_id
0,HEALTH,"When you feel like this, it’s important to kno...",0
1,HEALTH,I can vividly remember the first time I felt f...,0
2,HEALTH,Because it's only becoming more of a struggle....,0
3,HEALTH,"""The only Whole30 I want to participate in is ...",0
4,HEALTH,"Essentially, your hands are kept warm thanks t...",0


In [None]:
# load the stopwords
# Use a context manager so the file handle is closed as soon as the list is read
# instead of staying open for the lifetime of the kernel.
with open("stopwords.txt", "r") as text_file:
    # one stopword per line
    stopwords = text_file.read().split('\n')

In [None]:
# remove stop words
# Build the lookup once as a lowercase set: membership tests become O(1), and
# capitalised tokens such as "When" or "I" are matched as well (the text is only
# lowercased later, after stemming, so a case-sensitive check lets them through).
stopword_set = {word.lower() for word in stopwords}
data['news_without_stopwords'] = data['news'].apply(
    lambda text: ' '.join(word for word in text.split() if word.lower() not in stopword_set)
)
# sanity check: character length of the first cleaned article
print(len(data['news_without_stopwords'][0]))

In [None]:
# stemming
ps = PorterStemmer()
# stem every whitespace-separated token of the stopword-free text
data['news_porter_stemmed'] = data['news_without_stopwords'].apply(
    lambda text: ' '.join(ps.stem(word) for word in text.split())
)
# lowercase each token (re-joining also collapses runs of whitespace to single spaces)
data['news_porter_stemmed'] = data['news_porter_stemmed'].apply(
    lambda text: ' '.join(token.lower() for token in text.split())
)
# strip punctuation: the pattern is a regular expression, so regex=True must be
# explicit (pandas >= 2.0 treats the pattern as a literal string by default, which
# would silently remove nothing); the raw string avoids the invalid "\w" escape
data['news_porter_stemmed'] = data['news_porter_stemmed'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# count how often each token occurs across the whole stemmed corpus
all_tokens = ' '.join(data['news_porter_stemmed']).split()
freq = pd.Series(all_tokens).value_counts()


In [None]:
# keep only the tokens that occur at most three times in the corpus
freq2 = freq.loc[freq.le(3)]
freq2

In [None]:
# the rare tokens themselves, as a plain Python list
freq3 = freq2.index.tolist()
freq3

In [None]:
# drop the rare tokens from the stemmed text, then keep only the modelling columns
# A set gives O(1) membership checks; scanning the freq3 list for every word of
# every article makes this step needlessly slow.
rare_tokens = set(freq3)
data['news_porter_stemmed'] = data['news_porter_stemmed'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in rare_tokens)
)
data = data[['type', 'category_id', 'news_porter_stemmed']]

In [None]:
# preview the trimmed frame (type, category_id, news_porter_stemmed)
data.head()

Unnamed: 0,type,category_id,news_porter_stemmed
0,HEALTH,0,when feel this it your the peopl who say im ti...
1,HEALTH,0,i rememb time i fear frustrat food i 10 old i ...
2,HEALTH,0,becaus it becom to stay updat on the new witho...
3,HEALTH,0,the i particip eat 30 perfect tweet about that...
4,HEALTH,0,essentially hand kept warm thank body real rea...


In [None]:
# save the preprocessed data to CSV (index=False leaves the row index out of the file)
data.to_csv('preprocessed.csv',index=False)

# **2. Train and test the model**

In [None]:
# tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams: sublinear term frequency, L2-normalised rows,
# and terms that appear in fewer than 5 documents are ignored
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
)

In [None]:
# Vectorise the preprocessed text column of `data` (the frame built in the
# preparation section), so features and labels come from the same rows and the
# cell runs on a fresh kernel.
# astype('U') coerces every entry to a unicode string before vectorising.
features = tfidf.fit_transform(data.news_porter_stemmed.astype('U')).toarray()
labels = data.category_id
features.shape

(10981, 6835)

In [None]:
# positional rename: the first column ('type') becomes 'newstype'
renamed_columns = ['newstype', 'category_id', 'news_porter_stemmed']
data.columns = renamed_columns

In [None]:
# one row per distinct (newstype, category_id) pair, ordered by id
category_id_df = (
    data[['newstype', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)
# lookup tables in both directions: category name -> id and id -> category name
name_id_pairs = category_id_df.values
id_name_pairs = category_id_df[['category_id', 'newstype']].values
category_to_id = dict(name_id_pairs)
id_to_category = dict(id_name_pairs)

In [None]:
from sklearn.feature_selection import chi2

# number of top terms to print per category
N = 3
# The vocabulary is the same for every category, so fetch it once outside the loop.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.array(tfidf.get_feature_names_out())
else:
    all_feature_names = np.array(tfidf.get_feature_names())
for newstype, category_id in sorted(category_to_id.items()):
    # chi-squared statistic of every feature against "row belongs to this category"
    features_chi2 = chi2(features, labels == category_id)
    # ascending sort, so the highest-scoring terms end up last
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(newstype))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

In [None]:
from sklearn.manifold import TSNE

# t-SNE is computationally expensive, so only a 30% random sample is projected
SAMPLE_SIZE = int(len(features) * 0.3)
# fixed seed so the same rows are sampled on every run
np.random.seed(0)
indices = np.random.choice(
    range(len(features)),
    size=SAMPLE_SIZE,
    replace=False,
)
tsne = TSNE(n_components=2, random_state=0)
projected_features = tsne.fit_transform(features[indices])
# one colour per category id, used by the scatter plot
colors = ['pink', 'green', 'midnightblue', 'orange', 'darkgrey']

In [None]:
# scatter the projected sample, one colour and legend entry per category
ax = plt.gca()
for category, category_id in sorted(category_to_id.items()):
    in_category = (labels[indices] == category_id).values
    points = projected_features[in_category]
    ax.scatter(points[:, 0], points[:, 1], s=30, c=colors[category_id], label=category)
ax.set_title("tf-idf feature vector for each article, projected on 2 dimensions.",
             fontdict=dict(fontsize=15))
ax.legend()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_val_score


# candidate classifiers to compare
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
# number of cross-validation folds
CV = 5
# one (model_name, fold_idx, accuracy) record per model and fold; the frame is
# built once from these records rather than pre-allocated and then overwritten
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# train the model with logistic regression
from sklearn.model_selection import train_test_split

# hold out 33% of the rows for evaluation; the row index is split alongside the
# features and labels so each split row can be traced back to `data`
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, data.index, test_size=0.33, random_state=0
)

model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)
# hard class predictions and per-class probabilities for the held-out rows
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [None]:
# visualize
from sklearn.metrics import confusion_matrix
import seaborn as sns

# confusion matrix on the held-out split, drawn as an annotated heatmap
class_names = category_id_df.newstype.values
conf_mat = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')

In [None]:
# refit the model on the full feature matrix and labels (not just the training split)
model.fit(features, labels)

In [None]:

from sklearn.feature_selection import chi2

# number of top terms to print per category
N = 5
# The vocabulary does not change between categories, so fetch it once.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.array(tfidf.get_feature_names_out())
else:
    all_feature_names = np.array(tfidf.get_feature_names())
for newstype, category_id in sorted(category_to_id.items()):
    # ascending sort of this category's coefficients; reversed below so the
    # largest weights come first
    indices = np.argsort(model.coef_[category_id])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
    print("# '{}':".format(newstype))
    print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
    print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

In [None]:
# save the model
Pkl_Filename = "model.pkl"

# serialise the fitted classifier to disk (binary mode is required by pickle)
with open(Pkl_Filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# load the model back from the pickle file named by Pkl_Filename
# (unpickling can execute arbitrary code, so only load files you trust)
with open(Pkl_Filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# some texts to check
texts = [
         "Captain Tsubasa scores a magnificent goal for the Japanese team.",
         "Merryweather mercenaries are sent on another mission, as government oversight groups call for new sanctions.",
         "Ronaldo cured from the cancer disease",
         "You won't guess what the latest trend in data analysis is!",
         "another centuray for vetneran batsman AB de villier",
         "will elon musk the first person to land on the mars",]


def _prepare_for_model(text):
    """Clean one raw text the way the training data was cleaned.

    The vectorizer was fitted on stopword-free, Porter-stemmed, lowercased text,
    so raw input must go through the same steps; otherwise inflected words
    (e.g. "scores") never match the stemmed vocabulary (e.g. "score").
    Uses `stopwords` and `ps` from the data preparation section.
    """
    stopword_lookup = {word.lower() for word in stopwords}
    kept = [word for word in text.split() if word.lower() not in stopword_lookup]
    stemmed = ' '.join(ps.stem(word) for word in kept).lower()
    # keep word characters and whitespace only (same intent as the [^\w\s] pattern)
    return ''.join(ch for ch in stemmed if ch.isalnum() or ch == '_' or ch.isspace())


text_features = tfidf.transform([_prepare_for_model(text) for text in texts])
predictions = loaded_model.predict(text_features)
# print each original (unprocessed) text next to its predicted category name
for text, predicted in zip(texts, predictions):
    print('"{}"'.format(text))
    print("  - Predicted as: '{}'".format(id_to_category[predicted]))
    print("")