# Recurrent neural networks (RNN)

In [123]:
#spacy works for python 3.8.10
import spacy
#!python3.8 -m spacy download en_core_web_sm

import string
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from gensim import downloader

#modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, Embedding
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

#saving model
import pickle

### Functions and global vars definition

In [124]:
DEVELOPING = True
MAX_LENGTH = 80

In [125]:
nlp = spacy.load("en_core_web_sm")#english tokenizer, tagger, parser and NER
def split_sentences(text):
    aux_sentence = nlp(text)
    return [str(token.lemma_).lower() for token in aux_sentence if not (token.is_stop or \
                                                                token.is_punct or \
                                                                len(token)<3)]

In [126]:
data_train = pd.read_csv("clean_df_train.csv")
data_test = pd.read_csv("clean_df_test.csv")

In [127]:
#subsampling for developing faster
if DEVELOPING:
    data_train = data_train.groupby('label').apply(lambda x: x.sample(frac=0.03)).reset_index(drop=True)
    data_test = data_test.groupby('label').apply(lambda x: x.sample(frac=0.03)).reset_index(drop=True)

## preprocessing

### Analyzing length of each sentence

With a length of input of 80 words we are only not completely getting approximatelly a 5% of the data (train and test)

In [128]:
data_train["words_count"] = data_train.apply(lambda row: len(row["content"].split()), axis = 1)
data_test["words_count"] = data_test.apply(lambda row: len(row["content"].split()), axis = 1)

In [129]:
data_train[data_train["words_count"] > 80]

Unnamed: 0,label,title,content,words_count
20,0.0,Bridge 9 Records,Bridge Nine Records is a contemporary hardcor...,84
28,0.0,Focus DIY,Focus DIY was a privately owned chain of DIY ...,82
31,0.0,Buhl Aircraft Company,The Buhl Aircraft Company was founded in 1925...,83
72,0.0,B+H Ocean Carriers,B+H Ocean Carriers Ltd. (NYSE MKT: BHO) is an...,82
106,0.0,BBC Studios and Post Production,BBC Studios and Post Production is a commerci...,96
...,...,...,...,...
11484,13.0,Zero Hour: Crisis in Time,Zero Hour: Crisis in Time! is a five-issue co...,82
11494,13.0,The Greatest Show on Earth: The Evidence for E...,The Greatest Show on Earth: The Evidence for ...,94
11496,13.0,Birpurush (poem),Birpurush is a poem written by Rabindranath T...,85
11497,13.0,Lord Emsworth and Others,Lord Emsworth and Others is a collection of n...,84


In [130]:
data_test[data_test["words_count"] > 80]

Unnamed: 0,label,title,content,words_count
23,0,GoCar (carsharing),GoCar is the first Carsharing service in Irel...,84
32,0,Ideas Campaign,The Ideas Campaign is a grassroots initiative...,82
156,1,University of Iowa College of Law,The University of Iowa College of Law is one ...,89
157,1,Stillwater Area High School,Stillwater Area High School (SAHS) is a publi...,81
178,1,Gehlen Catholic High School,Gehlen Catholic School is located in LeMars I...,81
...,...,...,...,...
1370,13,The Saint in Miami,The Saint in Miami is the title of a mystery ...,86
1408,13,Funtastic Journey,Funtastic Journey was a comic strip that star...,81
1418,13,Liars and Outliers,Liars and Outliers: Enabling the Trust that S...,105
1427,13,Despite the Falling Snow,Despite the Falling Snow is a 2004 book by Sh...,81


In [131]:
#parallelizing (Not sure it works correctly. Maybe mac hardware is not good for this)
import dask.dataframe as dd
from dask.multiprocessing import get

ddata = dd.from_pandas(data_train, npartitions=30)

#data_train[:1000].apply(lambda row: split_sentences(row["content"]), axis = 1)
ddata.map_partitions(lambda df: df.apply((lambda row: split_sentences(row["content"])), axis=1)).compute(scheduler="threads")

KeyboardInterrupt: 

In [132]:
data_train["preprocessed_content"] = data_train.apply(lambda row: split_sentences(row["content"]), axis = 1)
data_test["preprocessed_content"] = data_test.apply(lambda row: split_sentences(row["content"]), axis = 1)

In [135]:
data_train.head()

Unnamed: 0,label,title,content,words_count,preprocessed_content
0,0.0,Rainstor,RainStor is a software company that provides ...,72,"[rainstor, software, company, provide, databas..."
1,0.0,GRGDN,GRGDN is a Turkish music production and artis...,61,"[grgdn, turkish, music, production, artist, ma..."
2,0.0,Oregon Economic and Community Development Depa...,The Oregon Business Development Department (O...,16,"[oregon, business, development, department, ob..."
3,0.0,Club Air,Club Air was an airline based in Verona Italy...,32,"[club, air, airline, base, verona, italy, oper..."
4,0.0,Stearns (automobile),F. B. Stearns and Company (later F.B. Stearns...,27,"[stearns, company, later, f.b., stearns, compa..."


In [138]:
data_test.head()

Unnamed: 0,label,title,content,words_count,preprocessed_content
0,0,Collectors Universe,Collectors Universe Inc. (NASDAQ:CLCT) was se...,49,"[collectors, universe, inc., nasdaq, clct, set..."
1,0,Ameritest,Ameritest is an international advertising res...,67,"[ameritest, international, advertising, resear..."
2,0,SicolaMartin,Y&R Austin is an integrated advertising and m...,30,"[y&r, austin, integrated, advertising, marketi..."
3,0,MarkMonitor,MarkMonitor Inc. is an American software comp...,58,"[markmonitor, inc., american, software, compan..."
4,0,Anteros Coachworks Inc.,Anteros Coachworks Inc. is an American sports...,25,"[anteros, coachworks, inc., american, sport, c..."


## Tokenizer

In [148]:
def encoding_data(data_train,data_test,max_length):
    """This function creates the tokenizer, fits it to the data, tokenizes and padds train and test dataframes,
    saves the fitted tokenizer into a file, and finally returns train and test dataframes with text encoded as well as 
    the labels one hot encoded and the word_index dictionary"""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_train["preprocessed_content"])

    # word_index is a dictionary contains indeces for the words like {'for': 8, 'is': 9, 'me': 10, ...}
    word_index = tokenizer.word_index

    # train_sequences is vectors where each vector represents a sentence
    train_sequences = tokenizer.texts_to_sequences(data_train["preprocessed_content"])
    test_sequences = tokenizer.texts_to_sequences(data_test["preprocessed_content"])

    train_sequences = pad_sequences(train_sequences, maxlen=max_length, padding="pre")
    test_sequences = pad_sequences(test_sequences, maxlen=max_length, padding="pre")

    # Convert our labels into one-hot encoded
    onehot_encoder = OneHotEncoder(sparse=False)
    onehot_encoder.fit(np.array(data_train["label"]).reshape(-1, 1))

    train_labels = onehot_encoder.transform(np.array(data_train["label"]).reshape(-1, 1))
    test_labels = onehot_encoder.transform(np.array(data_test["label"]).reshape(-1, 1))


    # saving
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


    return train_sequences, test_sequences, train_labels, test_labels, word_index

train_sequences, test_sequences, train_labels, test_labels, word_index = encoding_data(data_train, data_test, MAX_LENGTH)

In [147]:
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(np.array(data_train["label"]).reshape(-1, 1))
onehot_encoder.transform(np.array(data_train["label"]).reshape(-1, 1))

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [76]:
tokenizer.word_index

{'company': 1,
 'base': 2,
 'found': 3,
 'service': 4,
 'label': 5,
 'own': 6,
 'operate': 7,
 'american': 8,
 'united': 9,
 'new': 10,
 'group': 11,
 'product': 12,
 'business': 13,
 'headquarter': 14,
 'inc.': 15,
 'include': 16,
 'know': 17,
 'large': 18,
 'record': 19,
 'software': 20,
 'music': 21,
 'corporation': 22,
 'provide': 23,
 'international': 24,
 'records': 25,
 'states': 26,
 'game': 27,
 'store': 28,
 'bank': 29,
 'firm': 30,
 'independent': 31,
 'release': 32,
 'limited': 33,
 'office': 34,
 'city': 35,
 'world': 36,
 'manufacturer': 37,
 'locate': 38,
 'brand': 39,
 'development': 40,
 'system': 41,
 'establish': 42,
 'market': 43,
 'film': 44,
 'produce': 45,
 'california': 46,
 'technology': 47,
 'specialize': 48,
 'airline': 49,
 'state': 50,
 'subsidiary': 51,
 'form': 52,
 'design': 53,
 'entertainment': 54,
 'british': 55,
 'operation': 56,
 'country': 57,
 'private': 58,
 'create': 59,
 'south': 60,
 'bus': 61,
 'industry': 62,
 'video': 63,
 'start': 64,
 'pr

In [None]:
# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

### Word2vec