### importing libraries

In [None]:
import json
import re
from stop_words import get_stop_words
import tensorflow as tf
import keras
from collections import Counter

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

In [None]:
import os
# Show the current working directory; the dataset below is opened with a
# relative path, so it is resolved against this directory.
os.getcwd()

### reading in the json data into a list

In [None]:
# Each line of the dataset file is parsed as its own JSON document and
# appended to `data`. The `with` block guarantees the file handle is closed
# even if a line fails to parse.
data = []
with open("../input/News_Category_Dataset.json", 'r') as dataset_file:
    for line in dataset_file:
        data.append(json.loads(line))

### loading the common english stop words into a list

In [None]:
from nltk.corpus import stopwords
# English stop words plus the empty string, so that empty tokens can be
# filtered out with the same membership test as real stop words.
stop_words = [*get_stop_words('en'), '']

## function description
##### 1) The function uses the JSON data imported into the list data
##### 2) using the list data we extract the headline and short description of each article and store them, joined into one sentence, in a list
##### 3) extracted the category and stored it in a different list
##### 4) Then did a stop word removal on the data so that the n-gram frequency is not dominated by stop words
##### 5) Converted the word list to an NLTK Text so that we can access the rich collection of NLTK functions
##### 6) Used NLTK FreqDist and bigrams and Counter from collections to get the unigram and bigram counts
##### 7) Converted the dictionaries to pandas DataFrames and then exported them to disk using to_excel

In [None]:
def _normalise_text(text):
    """Expand "'s" to " is", strip punctuation and lower-case *text*."""
    return re.sub(r'[^\w\s]', '', text.replace("'s", " is")).lower()


def ngram_counter():
    """Build unigram and bigram counts for the corpus and export them to Excel.

    Reads the module-level ``data`` list (one dict per article) and the
    module-level ``stop_words`` collection. Every article whose headline is not
    empty has its ``headline`` and ``short_description`` normalised in place
    and joined into a single sentence; its ``category`` becomes the label.
    Stop words are removed from the token stream before counting so that they
    do not dominate the n-gram frequencies.

    Side effects:
        * overwrites the ``headline`` and ``short_description`` values of the
          processed entries in ``data`` with their normalised form;
        * writes ``ngrams4.xlsx`` with the sheets ``one_gram`` and ``bi_gram``.

    Returns:
        tuple: ``(nlp, bi_grams_counts, sentences, target)`` where ``nlp`` is
        the ``nltk.Text`` built from the stop-word-filtered tokens,
        ``bi_grams_counts`` is a ``Counter`` keyed by pairs of consecutive
        tokens, ``sentences`` is the list of normalised
        "headline short_description" strings (stop words retained) and
        ``target`` is the list of categories aligned with ``sentences``.
    """
    sentences = []
    target = []
    for article in data:
        # Articles without a headline are skipped entirely.
        if article['headline'] == '':
            continue
        article['headline'] = _normalise_text(article['headline'])
        article['short_description'] = _normalise_text(article['short_description'])
        sentences.append(article['headline'] + ' ' + article['short_description'])
        target.append(article['category'])

    # A set gives constant-time membership tests; the filter below runs once
    # per token of the whole corpus.
    stop_word_set = set(stop_words)
    data_new = []
    for sentence in sentences:
        for word in sentence.split(" "):
            if word not in stop_word_set:
                data_new.append(word)

    nlp = nltk.Text(data_new)

    freq = nltk.FreqDist(nlp)
    # Bigrams are taken over the flat token stream, so a pair may span the
    # boundary between two consecutive articles.
    bi_grams_counts = Counter(nltk.bigrams(nlp))

    unigram = (pd.DataFrame.from_dict(data=freq, orient='index')).rename(columns={0: 'frequency'})
    bi_gram = (pd.DataFrame.from_dict(data=bi_grams_counts, orient='index')).rename(columns={0: 'frequency'})

    # The context manager saves and closes the workbook on exit;
    # ExcelWriter.save() is no longer available in pandas 2.x.
    with pd.ExcelWriter('ngrams4.xlsx') as writer:
        unigram.to_excel(writer, sheet_name='one_gram')
        bi_gram.to_excel(writer, sheet_name='bi_gram')

    return nlp, bi_grams_counts, sentences, target

#### checking the function execution time

##### 1) importing the time library, whose time.time() returns the Unix time at the instant it is called
##### 2) saving the start time to a variable start
##### 3) running the ngrams function
##### 4) running the time.time() to save the stop time
##### 5) printing the difference between start and stop which is the time taken by the function ngram_counter

### ngram_counter() in my testing ran with a mean value of 87 seconds.

In [None]:
import time

# Take a wall-clock timestamp on each side of the call and print the
# difference, i.e. the number of seconds ngram_counter() took.
start = time.time()
(nlp, bi_grams_counts, sentences, target) = ngram_counter()
stop = time.time()
print(stop - start)

### displaying the most common unigrams

In [None]:
# Rebuild the unigram frequency distribution from the token stream returned
# by ngram_counter().
freq = nltk.FreqDist(nlp)
# Last expression of the cell: the 20 most frequent tokens with their counts.
freq.most_common(20)

### displaying the most common bigrams

In [None]:
# The 20 most frequent pairs of consecutive tokens with their counts.
bi_grams_counts.most_common(20)

### checking the number of words in the corpus and also checking the lexical diversity of the corpus

In [None]:
# In the author's run: around 2 million words with a lexical diversity of 0.03.
# Lexical diversity here is the number of distinct tokens divided by the
# total number of tokens.
print("number of words: ",len(nlp))
print("lexical diversity: ",len(set(nlp)) / len(nlp))

### checking for collocation

In [None]:
# Ask NLTK for the collocations it finds in the filtered token stream.
nlp.collocations()

### plotting the most frequent unigrams

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Rank the distribution once and reuse the result for both the bar heights
# and the tick labels, instead of calling most_common(50) per argument.
top_unigrams = freq.most_common(50)
unigram_labels = [word for word, _ in top_unigrams]
unigram_counts = [count for _, count in top_unigrams]

plt.figure(figsize=(14,12))
plt.bar(unigram_labels, unigram_counts, align='center', alpha=0.5, color='r')
# Vertical tick labels keep the 50 words readable.
plt.xticks(unigram_labels, rotation='vertical')
plt.ylabel('Frequency')
plt.xlabel('Top 50 high frequency word tokens')
plt.title('Top 50 words')

### Plotting the most frequent bigrams

In [None]:
# Rank the bigram counts once and reuse the result for both the bar heights
# and the tick labels, instead of calling most_common(50) per argument.
top_bigrams = bi_grams_counts.most_common(50)
# Each key is a pair of tokens; join the two into a single axis label.
bigram_labels = [pair[0] + ' ' + pair[1] for pair, _ in top_bigrams]
bigram_counts = [count for _, count in top_bigrams]

plt.figure(figsize=(14,12))
plt.bar(bigram_labels, bigram_counts, align='center', alpha=0.5, color='r')
# Vertical tick labels keep the 50 bigrams readable.
plt.xticks(bigram_labels, rotation='vertical')
plt.ylabel('Frequency')
# The labels name bigrams; they were previously copied from the unigram plot.
plt.xlabel('Top 50 high frequency bigrams')
plt.title('Top 50 bigrams')

### doing a word cloud on the corpus

In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Build the cloud from the whole corpus joined into one string; str.join
# accepts the list directly, so no wrapping generator is needed.
wordcloud = WordCloud(stopwords=stop_words, background_color='white', random_state=123).generate(" ".join(sentences))

In [None]:
# Print the WordCloud object, then draw it on an 8x8 figure.
print(wordcloud)
fig = plt.figure(figsize=(8,8))
plt.imshow(wordcloud)
# Turn the axes off for the rendered image.
plt.axis('off')
plt.show()

### Performing classification on the data: a model is trained on sentences together with their category tags, and on the test data it is given only the sentence and predicts the tag

### I find that classifiers trained on this data have a practical application: they can automatically sort a news article into a subcategory without analyzing the entire article, thereby saving a lot of compute.

### As there are 31 classes, an output accuracy as low as 50% is still a good result; with 31 classes it shows the model is not just predicting random values.

### In dichotomous (two-class) classification an accuracy of 50% is what a model producing random values would achieve, but in our case 50% accuracy means the model has learnt to assign 50% of the data to the correct class.

### performing a train test split on the corpus

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the sentences (with their categories) for testing; the
# fixed random_state keeps the split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(sentences, target, test_size= 0.2, random_state=123)

### Creating the TFIDF matrix using Sklearn

In [None]:
# Unigram TF-IDF vectorizer with unicode accent stripping.
word_vectorizer = TfidfVectorizer(ngram_range=(1, 1), strip_accents='unicode')

# Learn the vocabulary and IDF weights on the training split only and
# vectorize that split in the same step ...
x_train_word_features = word_vectorizer.fit_transform(x_train)

# ... then reuse the fitted vectorizer unchanged for the held-out split.
test_features = word_vectorizer.transform(x_test)

### as we will be calculating accuracy multiple times, writing an accuracy function to avoid repeating the code

In [None]:
def accuracy(model, train_pred):
    """Return ``(train_accuracy, test_accuracy)`` for a fitted model.

    The training score compares ``train_pred`` (predictions the caller already
    made on the training split) with the module-level ``y_train``. The test
    score is obtained by letting ``model`` predict on the module-level
    ``test_features`` and comparing the result with ``y_test``.
    """
    score_on_train = accuracy_score(y_true=y_train, y_pred=train_pred)
    score_on_test = accuracy_score(y_true=y_test, y_pred=model.predict(test_features))
    return score_on_train, score_on_test

### as the vocabulary is very large, the tfidf matrix is very high dimensional and sparse, so performing matrix decomposition and keeping the first 150 components

In [None]:
# Reduce the sparse TF-IDF matrix to 150 dense components. The seed makes the
# decomposition repeatable, matching the random_state=123 used for the
# train/test split and the word cloud.
svd = TruncatedSVD(n_components=150, random_state=123)
# Fit the projection on the training split only, then apply it to the test split.
x_train_word_features = svd.fit_transform(x_train_word_features)
test_features = svd.transform(test_features)

### performing logistic regression on the data and checking the train and test accuracy

In [None]:
# Fit a logistic regression on the training features; fit() returns the
# estimator itself, so construction and fitting can be chained.
model_logistic = LogisticRegression().fit(x_train_word_features, y_train)
# accuracy() expects the training-split predictions to be passed in.
pred = model_logistic.predict(x_train_word_features)
tr_accu, tes_accu = accuracy(model_logistic, pred)
print("train accuracy: ",tr_accu," test accuracy: ",tes_accu)

### building a Multi layer perceptron to predict the output using keras

In [None]:
# One-hot encode the training labels for the categorical_crossentropy loss.
# NOTE(review): this rebinds the global y_train that accuracy() compares
# against, so accuracy() presumably should not be called with label
# predictions after this cell has run, and re-running the cell encodes the
# already-encoded frame again - confirm.
y_train = pd.get_dummies(y_train)

In [None]:
# Derive the layer widths from the data instead of hard-coding them, so the
# network still matches if the number of SVD components or of categories
# present in the training split changes.
n_features = x_train_word_features.shape[1]
n_classes = y_train.shape[1]  # y_train is one-hot encoded: one column per class

# Two hidden ReLU layers, each followed by 50% dropout, and a softmax output
# with one unit per class.
inp = keras.layers.Input(shape=[n_features])
layer_1 = keras.layers.Dense(300,activation='relu')(inp)
drop1 = keras.layers.Dropout(0.5)(layer_1)
layer_2 = keras.layers.Dense(100,activation='relu')(drop1)
drop2 = keras.layers.Dropout(0.5)(layer_2)
out = keras.layers.Dense(n_classes,activation='softmax')(drop2)
model_dnn = keras.models.Model(inputs = inp,outputs=out)
model_dnn.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
# 10% of the training rows are held back by Keras for validation.
model_dnn.fit(x_train_word_features,y_train,batch_size=32,epochs=4,shuffle=True,validation_split=0.1)