# <center> News Classification with NLP and Neural Networks</center>

### Imports

In [1]:
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import preprocessing
import plotly.graph_objects as go
warnings.filterwarnings('ignore')

### Data
- Import JSON file
- Limit to top categories

In [21]:
# Read in json file to DataFrame
df = pd.read_csv('wiki_movie_plots_deduped.csv')

# Only keep relevant data
df = df.drop(columns=['Title','Origin/Ethnicity','Wiki Page','Cast','Director'])

# Format column names for sanity
df.columns = [i.replace(" ","_").lower() for i in df.columns]

# View Results
print(df.shape)
df.sample(5)

(34886, 3)


Unnamed: 0,release_year,genre,plot
12667,1996,action,A group of terrorists led by Luther (Jeff Kobe...
31133,2012,drama,Sundarapandian (Sasi Kumar) is the only son of...
17853,2010,war,The great film begins with a video log by Elli...
21752,1980,drama,The Hounds of Notre Dame is about 36 hours in ...
22682,2013,romance / drama,"After graduation, the girls gathered together ..."


# Cleaning

#### Check data types

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   release_year  34886 non-null  int64 
 1   genre         34886 non-null  object
 2   plot          34886 non-null  object
dtypes: int64(1), object(2)
memory usage: 817.8+ KB


#### Check NaNs 

In [11]:
df.isna().sum()

release_year    0
genre           0
plot            0
dtype: int64

#### Check Duplicates

In [22]:
print(f"There are {df.duplicated().sum()} duplicated rows")

# Drop duplicates
df = df.drop_duplicates()

print(f"There are now {df.duplicated().sum()} duplicated rows")

There are 138 duplicated rows
There are now 0 duplicated rows


### Genre Aggregation

In [23]:
df['genre'] = df['genre'].apply(lambda x: x.replace(" ",",").replace("/",",").replace("-",","))

In [24]:
df['genre'].value_counts().head(50)

unknown             6012
drama               5950
comedy              4376
horror              1160
action              1091
thriller             964
romance              921
western              862
crime                566
adventure            525
crime,drama          480
romantic,comedy      474
musical              466
science,fiction      431
film,noir            344
mystery              310
comedy,drama         302
war                  273
animation            264
comedy,,drama        235
sci,fi               221
family               217
fantasy              203
animated             195
musical,comedy       163
biography            136
anime                110
suspense             104
romantic,drama       102
action,thriller       93
animated,short        91
drama,,romance        86
social                82
historical            77
documentary           73
action,comedy         72
serial                71
world,war,ii          70
war,drama             70
crime,thriller        68


In [31]:
for genre in df['genre'].value_counts().keys():
    for g in genre.split(","):
        if g in df['genre'].value_counts().keys():
            df['genre'].map({g:genre})

In [32]:
df['genre'].value_counts().head(50)

unknown             6012
drama               5950
comedy              4376
horror              1160
action              1091
thriller             964
romance              921
western              862
crime                566
adventure            525
crime,drama          480
romantic,comedy      474
musical              466
science,fiction      431
film,noir            344
mystery              310
comedy,drama         302
war                  273
animation            264
comedy,,drama        235
sci,fi               221
family               217
fantasy              203
animated             195
musical,comedy       163
biography            136
anime                110
suspense             104
romantic,drama       102
action,thriller       93
animated,short        91
drama,,romance        86
social                82
historical            77
documentary           73
action,comedy         72
serial                71
world,war,ii          70
war,drama             70
crime,thriller        68


In [None]:
# Dictionary to map aliases
genre_alias_dict = {'sci-fi','science fiction'}

# <center> -------------------------------------------------------------------- </center>

# EDA

#### View categories

#### View length of texts

### Author Activity

# <center> -------------------------------------------------------------------- </center>

# Preprocessing Text Data
- Stop words
- Stem / Lemmatize text
- Tokenization
- Use all words or just most frequent?
- Use bigrams, POS taggins, Mutual information Scores?
- What sort of vectorization? (Boolean / Count / TF-IDF / Word2Vec)

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

### Encode Target Variable
- Assign a unique value to each categories

In [None]:
# Instantiate label encoder 
le = preprocessing.LabelEncoder()

# Label Encode categories
df['class_label'] = le.fit_transform(df['category'])

# Create dictionary to map labels to categories
label_to_category = dict(df.groupby('class_label')['category'].first())

### Get Stopwords

In [None]:
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``']

### Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

# Define dependent and independent variables
X = df["text"]
y = df["class_label"]

# Perform Test Train Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=8)

### Count Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate CountVectorizer with custom list of stop words
count_vectorizer = CountVectorizer(stop_words= stopwords_list)

# Fit the tokenizer on the training data
count_vectorizer.fit(X_train)

# Apply the tokenizer to the training and testing data
X_train_counts = count_vectorizer.transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)
X_val_counts = count_vectorizer.transform(X_val)

# View Results
print(f'There are {X_train_counts.shape[0]} observations and {X_train_counts.shape[1]} features')
pd.DataFrame(data= X_val_counts[:5].toarray(), columns = count_vectorizer.get_feature_names())

## Model: Neural Network

In [None]:
from keras import models
from keras import layers
from keras import optimizers
from keras.utils.np_utils import to_categorical

### OHE target

In [None]:
from keras.utils.np_utils import to_categorical

# OHE for netural network
y_train_ohe = to_categorical(y_train)
y_val_ohe = to_categorical(y_val)

### Create model
- Fully connected (dense) layer network with relu activation
- 2 hidden layers with 50 units in 1st and 25 in second
- Softmax classifier for nulticlass problem

In [None]:
# Initialize a sequential model
model = models.Sequential()

# One layer with softmax activation 
model.add(layers.Dense(41, activation='softmax'))

### Compiling the model
- Loss function = categorical crossentropy
- Optimizer = stochastic gradient descent

In [None]:
model.compile(optimizer='SGD',
              loss='categorical_crossentropy',
              metrics=['acc'])

In [None]:
# Train the model 
n_epochs = 10
history = model.fit(X_train_counts, y_train_ohe, epochs= n_epochs, validation_data = (X_val_counts, y_val_ohe))

### Evaluation

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_model_metrics(actuals, predictions, full_report = False, plot = False):  

    predictions_bool = np.argmax(predictions, axis=1)
    
    precision = round(precision_score(actuals, predictions_bool, average='weighted'),3)
    recall = round(recall_score(actuals, predictions_bool, average='weighted'),3)
    f1 = round(f1_score(actuals, predictions_bool, average='weighted'),3)
    accuracy = round(accuracy_score(actuals, predictions_bool),3)
    
    if full_report == False:
        print(f"Accuracy = {accuracy} \nPrecision = {precision} \nRecall = {recall} \nf1 = {f1}")
    else:
        print(classification_report(actuals, predictions_bool))
    if plot == True:
        fig = go.Figure()

        fig.add_trace(go.Scatter(x=[i for i in range(20)], y=history.history['acc'],
                            mode='lines+markers', name='Train Accuracy'))
        fig.add_trace(go.Scatter(x=[i for i in range(20)], y=history.history['val_acc'],
                            mode='lines+markers', name='Validation Accuracy'))

        fig.update_layout(height= 500, width= 700)

        fig.show()

In [None]:
get_model_metrics(y_val, model.predict(X_val_counts), full_report = 0, plot = 1)