In [12]:
# Libraries for working on data
import numpy as np
import pandas as pd


# Libraries for preprocessing and visualisation
import re
import matplotlib.pyplot as plt
from seaborn import pairplot as pairlot_seaborn


# Libraries for building and compiling model 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, GlobalAveragePooling1D,Flatten
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dropout
from tensorflow.math import confusion_matrix as confusion_matrix_tf
from sklearn.metrics import precision_score as precision_score_tf
from sklearn.metrics import recall_score as recall_score_tf
from sklearn.metrics import f1_score as f1_score_tf


# Libraries for exporting the model
from pickle import dump as dump_pickle

# Indicator variable to see if the model is completed and ready to be exported
IS_MODEL_FINAL = 1


In [13]:
if IS_MODEL_FINAL:
    confirm_choice = input("Compiling for final model. Please confirm y/N")
    if confirm_choice.lower() == 'n':
        IS_MODEL_FINAL = 0
        
# Importing Data and some prelim preprocessing
df = pd.read_excel("Classification_Categorizations.xlsx")
df = df.dropna()
data = df[["title","m_label"]]
COL_NAME_TITLE = 'Title'
COL_NAME_LABEL = 'Category'
data.rename(columns = {'m_label':COL_NAME_LABEL}, inplace = True)
data.rename(columns = {'title':COL_NAME_TITLE}, inplace = True)
NUMBER_OF_RECORDS = data.shape[0]

# Model Constants and prelim data visualisation
CATEOGARIES = ['Avalanche', 'BoatCapsizes', 'BridgeCollapse', 'BuildingCollapse', 'CloudBurst', 'Coldwave', 'Cyclone', 'Earthquake', 'Flood', 'Forestfire', 'GasLeak', 'Hailstorm','Heatwave', 'Landslide','Oilspill','Other', 'Rainfall', 'Snowfall', 'Thunderstorm', 'TrainAccident', 'WeatherUpdate']
CATEOGARIES_TO_REMOVE = []

# Removing the data of the not required cateogaries
for cat in CATEOGARIES_TO_REMOVE:
    data = data[data[COL_NAME_LABEL] != cat]    
    CATEOGARIES.remove(cat)

CATEOGARY_COUNT = data[COL_NAME_LABEL].value_counts(normalize=False,dropna=True)
print("Number of records: " + str(NUMBER_OF_RECORDS))
print("Number of cateogaries: " + str(len(CATEOGARIES)))
print(CATEOGARY_COUNT)

# Randomising the dataset
data = data.sample(frac=1)

# Oversampling using replication technique
data_new = pd.DataFrame([])

CATEOGARIES = np.array(CATEOGARY_COUNT.index)
COUNT_MAX_CAT = max(CATEOGARY_COUNT)
data_new = pd.DataFrame([])

for cat in CATEOGARIES:
    df_temp = data[data[COL_NAME_LABEL] == cat].sample(COUNT_MAX_CAT,replace=True)
    data_new = pd.concat([data_new,df_temp],axis=0)
    
data = data.reset_index(drop=True)

Number of records: 13962
Number of cateogaries: 21
Category
Rainfall            4372
Flood               1916
WeatherUpdate       1199
Cyclone              973
Earthquake           945
Heatwave             909
Landslide            634
Coldwave             598
BuildingCollapse     320
Thunderstorm         280
Oilspill             254
Other                247
Snowfall             235
Forestfire           185
BridgeCollapse       165
Avalanche            157
GasLeak              133
TrainAccident        127
BoatCapsizes         110
Hailstorm            110
CloudBurst            93
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns = {'m_label':COL_NAME_LABEL}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns = {'title':COL_NAME_TITLE}, inplace = True)


In [14]:
# Function to do all the data cleaning and preprocessing. Remove all unwanted words. 
def clean_text(text):
    cities = ["kullu","ncr","ladakh","tirupati","joshimath","mysuru","Abdul","Adilabad","Adwani","Agartala","Agra","Ahmedabad","Ahmednagar","Aizawl","Ajabpur","Ajmer","Akividu","Akola","Alanallur","Alangulam","Alappuzha","Aldona","Alibag","Aligarh","Alipur","Alipur","Allahabad","Almora","Aluva","Alwar","Amal","Amalapuram","Ambad","Ambah","Ambala","Ambarnath","Ambejogai","Ambikapur","Ambur","Amer","Amet","Amravati","Amreli","Amritsar","Anand","Anantapur","Anantnag","Anantpur","Anchal","Andheri","Andra","Angadipuram","Angul","Ankleshwar","Annamalainagar","Antapur","Arakkonam","Arani","Aranmula","Arch","Ariyalur","Arora","Arpora","Arunachal","Arvi","Asansol","Assagao","Attingal","Attur","Aundh","Aurangabad","Aurangabad","Avanigadda","Azamgarh","Baddi","Badlapur","Bagalkot","Bagh","Bagpat","Bahadurgarh","Baharampur","Baidyabati","Bala","Balaghat","Balana","Balanagar","Balangir","Balasore","Bali","Bali","Ballabgarh","Balu","Balurghat","Bambolim","Banda","Bandra","Banga","Bangalore","Bangaluru","Bangaon","Bank","Banka","Bankura","Banswara","Bapatla","Barakpur","Baramati","Barddhaman","Bardoli","Bareilly","Bargarh","Barmer","Barnala","Baroda","Barpali","Barpeta","Basirhat","Basti","Basu","Batala","Bawan","Bawana","Beawar","Begusarai","Behala","Bela","Belapur","Belgaum","Belgharia","Bellare","Bellary","Bemetara","Berasia","Betalbatim","Betim","Betul","Bhadath","Bhadohi","Bhadravati","Bhagalpur","Bhagwan","Bhandari","Bhandup","Bharatpur","Bharuch","Bhatapara","Bhatinda","Bhatkal","Bhavnagar","Bhawan","Bhilai","Bhilwara","Bhimavaram","Bhiwandi","Bhiwani","Bhoj","Bhongir","Bhopal","Bhubaneswar","Bhuj","Bhusawal","Bichpuri","Bidar","Bihar Sharif","Bijapur","Bikaner","Bilaspur","Bilaspur","Bilimora","Binavas","Binnaguri","Bishnupur","Bobbili","Bodhan","Bodinayakkanur","Boisar","Bokaro","Bolpur","Botad","Brahmapur","Budaun","Budbud","Budha","Bulandshahr","Bundi","Calangute","Candolim","Canning","Caranzalem","Chakan","Chakra","Chalisgaon","Chamba","Champa","Chand","Chandan","Chandannagar","Chandauli","Chandausi","Chandigarh","Chandrapur","Changanacheri","Channapatna","Charan","Charu","Chen","Chengannur","Chennai","Chetan","Cheyyar","Chhabra","Chhachhrauli","Chhota Udepur","Chicalim","Chidambaram","Chikmagalūr","Chikodi","Chinchvad","Chintamani","Chiplun","Chirala","Chitra","Chitradurga","Chittoor","Chittur","Choolai","Chopda","Chopra","Churachandpur","Coimbatore","Colaba","Connaught Place","Coonoor","Cuddalore","Cumbum","Cuncolim","Curchorem","Cuttack","Dadri","Dahanu","Dahod","Dam Dam","Daman","Damoh","Dang","Dangi","Darbhanga","Darjeeling","Darsi","Dasna","Dasua","Davangere","Dehradun","Delhi","Deolali","Deoria","Devgarh","Devipattinam","Dewas","Dhaka","Dhamtari","Dhanbad","Dhansura","Dhar","Dharamsala","Dharapuram","Dharavi","Dhariwal","Dharmapuri","Dharwad","Dhenkanal","Dhone","Dhrol","Dhubri","Dhule","Dhuri","Dibrugarh","Dicholi","Dimapur","Dinanagar","Dindigul","Dindori","Dipas","Dogadda","Dona Paula","Dumka","Durg","Durgapur","Dwarahat","Dwarka","Edavanna","Ekkattuthangal","Ellora Caves","Eluru","Eral","Ernakulam","Erode","Etawah","Faizabad","Farakka","Faridabad","Faridkot","Fatehabad","Fatehgarh","Fatehpur","Firozabad","Firozpur","Fort","Gadag","Gampalagudem","Gandhidham","Gandhigram","Gandhinagar","Ganga","Ganganagar","Gangapur","Gangrar","Gangtok","Gannavaram","Ganpat","Gargoti","Garhshankar","Gaya","Ghana","Ghatal","Ghatkopar","Ghaziabad","Goa","Gobichettipalayam","Godhra","Gohana","Golaghat","Gold","Gonda","Gorakhpur","Goregaon","Goshaingaon","Gudivada","Gudur","Guindy","Gujrat","Gulbarga","Guna","Guntur","Gurdaspur","Gurgaon","Guruvayur","Guwahati","Gwalior","Habra","Hadadi","Haldia","Haldwani","Hamirpur","Hamirpur","Hansi","Hapur","Hari","Haridwar","Haripad","Haripur","Haryana","Hassan","Haveri","Hazaribagh","Himatnagar","Hinganghat","Hingoli","Hira","Hiriyur","Hisar","Honavar","Hong","Hoshangabad","Hoshiarpur","Hosur","Howrah","Hubli","Hugli","Hyderabad","Ichalkaranji","Idukki","Igatpuri","Iglas","Imphal","Indore","Indraprast","Irinjalakuda","Itanagar","Jabalpur","Jadabpur","Jagdalpur","Jagraon","Jaipur","Jaisalmer","Jajpur","Jalalabad","Jalalpur","Jalandhar","Jalesar","Jalgaon Jamod","Jalna","Jalpaiguri","Jamal","Jammu","Jamnagar","Jamshedpur","Janjgir","Jaspur","Jatani","Jaunpur","Jayanti","Jaynagar","Jaypur","Jha Jha","Jhajjar","Jhalawar","Jhansi","Jhargram","Jharsuguda","Jhunjhunun","Jind","Jodhpur","Jorhat","Junagadh","Kadapa","Kagal","Kailaras","Kaimganj","Kaithal","Kakdwip","Kakinada","Kaladi","Kalam","Kalamboli","Kalan","Kalinga","Kalka","Kalkaji Devi","Kalol","Kalpakkam","Kalpetta","Kalra","Kalyan","Kalyani","Kamalpur","Kamalpura","Kamat","Kanakpura","Kanchipuram","Kanchrapara","Kandi","Kangayam","Kangra","Kanhangad","Kanigiri","Kaniyambadi","Kankauli","Kanniyakumari","Kannur","Kanpur","Kapurthala Town","Karad","Karaikal","Karaikudi","Karamadai","Karamsad","Karanja","Karari","Kargil","Karimganj","Karimnagar","Karjat","Karnal","Karsiyang","Karur","Karwar","Kasal","Kasaragod","Kasganj","Kashipur","Kasia","Kataria","Kathua","Katni","Katoya","Katra","Kaul","Kavali","Kavaratti","Kayamkulam","Keshod","Khajuraho Group of Monuments","Khalapur","Khambhat","Khammam","Khan","Khanna","Kharagpur","Kharar","Khargone","Khatauli","Kheda","Khergam","Kheri","Khinwara","Khopoli","Khurda","Khurja","Kishangarh","Koch Bihar","Kochi","Kodaikanal","Kodungallur","Kohima","Kokrajhar","Kolar","Kolayat","Kolhapur","Kolkata","Kollam","Kollegal","Koni","Koni","Konnagar","Koothanallur","Koppal","Koraput","Korba","Kosamba","Kot Isa Khan","Kota","Kotian","Kottagudem","Kottakkal","Kottarakara","Kottayam","Kovilpatti","Kovvur","Kozhikode","Krishnagiri","Kulti","Kumar","Kumbakonam","Kumhari","Kundan","Kunwar","Kuppam","Kurali","Kurnool","Kushalnagar","Kuzhithurai","Ladwa","Lakhimpur","Lala","Lalgudi","Lamba Harisingh","Lanka","Latur","Liluah","Lohaghat","Lucknow","Ludhiana","Machhiwara","Machilipatnam","Madanapalle","Madgaon","Madhoganj","Madikeri","Madurai","Madurantakam","Mahabalipuram","Mahad","Mahajan","Mahal","Maharaj","Mahatma","Mahesana","Mahesh","Mahim","Mahulia","Malappuram","Maldah","Malpur","Manali","Mancherial","Mandal","Mandapeta","Mandi","Mandla","Mandsaur","Mandvi","Mandya","Mangalagiri","Mangalore","Mangaon","Manipala","Manipur","Manjeri","Manna","Mannargudi","Manor","Mansa","Manu","Markal","Markapur","Marmagao","Maru","Mashobra","Matar","Mathan","Mathura","Mattanur","Mavelikara","Mawana","Mayapur","Medak","Medarametla","Medchal","Medinipur","Meerut","Mehra","Mettur","Mhow","Mill","Miraj","Mirza Murad","Mirzapur","Mithapur","Modasa","Moga","Mohala","Mohali","Mohan","Moradabad","Morena","Morinda","Morvi","Motihari","Mount Abu","Muddanuru","Mukerian","Muktsar","Multi","Mumbai","Mundgod","Mundra","Munger","Murshidabad","Mussoorie","Muzaffarnagar","Muzaffarpur","Mylapore","Mysore","Nabadwip","Nabha","Nadgaon","Nadia","Nadiad","Nagal","Nagapattinam","Nagar","Nagara","Nagari","Nagaur","Nagercoil","Nagpur","Nagwa","Naini","Nalagarh","Nalbari","Nalgonda","Namakkal","Namrup","Nanda","Nanded","Nandi","Nandigama","Nandurbar","Nandyal","Naraina","Narasaraopet","Narayangaon","Narela","Narnaul","Narsapur","Nashik","Nathdwara","Navelim","Navsari","Nayagarh","Nazira","Nehra","Nellore","Neral","Neri","New Delhi","Neyveli","Nila","Nilambur","Nilokheri","Nizamabad","Noida","Nongpoh","Nongstoin","North Lakhimpur","Nurpur","Nuzvid","Odhan","Omalur","Ongole","Ooty","Orai","Osmanabad","Ottappalam","Pachmarhi","Padrauna","Pahalgam","Pakala","Pala","Palakkad","Palampur","Palani","Palayam","Palghar","Pali","Palladam","Paloncha","Palus","Palwal","Panchal","Panchgani","Pandharpur","Panipat","Panjim","Panruti","Pantnagar","Panvel","Paonta Sahib","Parappanangadi","Paravur","Parbhani","Parel","Parra","Patan","Patancheru","Patel","Patelguda","Pathanamthitta","Pathankot","Patiala","Patna","Pattambi","Pattukkottai","Pauri","Payyanur","Peddapuram","Pehowa","Perambalur","Peranampattu","Perundurai","Petlad","Phagwara","Phaphamau","Piduguralla","Pilani","Pileru","Pilkhuwa","Pimpri","Pitampura","Pithapuram","Pithoragarh","Pochampalli","Pollachi","Ponda","Ponnani","Ponneri","Porbandar","Port Blair","Potti","Powai","Proddatur","Puducherry","Pudukkottai","Puliyur","Punalur","Pune","Puras","Puri","Purnea","Puruliya","Pusa","Pushkar","Puttur","Puttur","Quepem","Raichur","Raigarh","Raipur","Raipur","Rajahmundry","Rajapalaiyam","Rajapur","Rajkot","Rajpur","Rajpura","Raju","Rama","Ramanagaram","Ramanathapuram","Ramapuram","Ramavaram","Ramgarh","Ramnagar","Rampur","Rana","Ranaghat","Ranchi","Rander","Raniganj","Ranippettai","Ranjan","Ratlam","Ratnagiri","Raurkela","Rawal","Raxaul","Rayagada","Rewa","Rewari","Ring","Rishikesh","Rohtak","Roorkee","Roshan","Rudrapur","Rupnagar","Rupnarayanpur","Sachin","Sagar","Sagar","Saha","Saharanpur","Sahibabad","Sakri","Sakri","Salem","Saligao","Salt Lake City","Samastipur","Sambalpur","Sanand","Sandur","Sangam","Sangamner","Sangli","Sangola","Sangrur","Sanquelim","Saranga","Sarangi","Sarwar","Satara","Satna","Sattur","Sawi","Secunderabad","Sehore","Sendhwa","Serampore","Shadnagar","Shahabad","Shahapur","Shahdara","Shahdol","Shahjahanpur","Shahkot","Shamsabad","Shanti Grama","Shillong","Shimla","Shimoga","Shirgaon","Shiv","Sholavandan","Shoranur","Shrigonda","Shyamnagar","Sibsagar","Sidhi","Sidhpur","Sikar","Sikka","Silchar","Siliguri","Silvassa","Singarayakonda","Singtam","Sinnar","Sion","Sirhind","Sirkazhi","Sirohi","Sirsa","Sirsi","Siruguppa","Siruseri","Sirwani","Sitapur","Siuri","Sivaganga","Sivakasi","Sodhi","Sojat","Solan","Solapur","Solim","Somnath","Soni","Sonipat","Sopara","Srikakulam","Srikalahasti","Srinagar","Sriperumbudur","Srirangam","Srivilliputhur","Sukma","Sultan","Sultanpur","Sultans Battery","Suman","Sunam","Sundargarh","Surana","Suratgarh","Surendranagar","Suriapet","Tadepallegudem","Tala","Talcher","Talegaon Dabhade","Talwandi Sabo","Tambaram","Tanda","Tanuku","Tarn Taran","Teri","Tezpur","Thalassery","Thane","Thanjavur","Thasra","Thenali","Thenkasi","Thirumangalam","Thiruthani","Thiruvananthapuram","Thiruvarur","Thoothukudi","Thrissur","Tikamgarh","Tindivanam","Tinsukia","Tiptur","Tiruchchendur","Tiruchi","Tirumala","Tirumala - Tirupati","Tirunelveli","Tiruppur","Tirur","Tiruvalla","Tiruvallur","Tiruvannamalai","Tohana","Tonk","Trimbak","Tuljapur","Tumkūr","Turaiyur","Udaigiri","Udaipur","Udupi","Ujjain","Ulhasnagar","Ulubari","Umred","Unnao","Uppal","Uttarkashi","Vadamadurai","Vadner","Vadodara","Vaikam","Vainguinim","Valsad","Vandalur","Vandavasi","Vaniyambadi","Vapi","Varanasi","Vasai","Vasco","Vashi","Vazhakulam","Vellore","Verna","Vidisha","Vijapur","Vijayawada","Vikarabad","Vikasnagar","Villupuram","Vinukonda","Virar","Visakhapatnam","Visnagar","Vizianagaram","Wai","Warangal","Wardha","Wellington","Yadgir","Yamunanagar","Yanam","Yavatmal","Yeola","Yercaud"]
    cities = [x.lower() for x in cities]
    cities_ = [temp+"s" for temp in cities]
    states = ["India","Andhra Pradesh","Arunachal Pradesh ","Assam","Bihar","Chhattisgarh","Goa","Gujarat","Haryana","Himachal Pradesh","Jammu and Kashmir","Jharkhand","Karnataka","Kerala","Madhya Pradesh","Maharashtra","Manipur","Meghalaya","Mizoram","Nagaland","Odisha","Punjab","Rajasthan","Sikkim","Tamil Nadu","Telangana","Tripura","Uttar Pradesh","Uttarakhand","West Bengal","Andaman and Nicobar Islands","Chandigarh","Dadra and Nagar Haveli","Daman and Diu","Lakshadweep","National Capital Territory of Delhi","Puducherry"]
    states = [x.lower() for x in states]
    states_ = [temp+"s" for temp in states]
    countries = ['afghanistan', 'aland islands', 'albania', 'algeria', 'american samoa', 'andorra', 'angola', 'anguilla', 'antarctica', 'antigua and barbuda', 'argentina', 'armenia', 'aruba', 'australia', 'austria', 'azerbaijan', 'bahamas (the)', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bermuda', 'bhutan', 'bolivia (plurinational state of)', 'bonaire, sint eustatius and saba', 'bosnia and herzegovina', 'botswana', 'bouvet island', 'brazil', 'british indian ocean territory (the)', 'brunei darussalam', 'bulgaria', 'burkina faso', 'burundi', 'cabo verde', 'cambodia', 'cameroon', 'canada', 'cayman islands (the)', 'central african republic (the)', 'chad', 'chile', 'china', 'christmas island', 'cocos (keeling) islands (the)', 'colombia', 'comoros (the)', 'congo (the democratic republic of the)', 'congo (the)', 'cook islands (the)', 'costa rica', "cote d'ivoire", 'croatia', 'cuba', 'curacao', 'cyprus', 'czechia', 'denmark', 'djibouti', 'dominica', 'dominican republic (the)', 'ecuador', 'egypt', 'el salvador', 'equatorial guinea', 'eritrea', 'estonia', 'ethiopia', 'falkland islands (the) [malvinas]', 'faroe islands (the)', 'fiji', 'finland', 'france', 'french guiana', 'french polynesia', 'french southern territories (the)', 'gabon', 'gambia (the)', 'georgia', 'germany', 'ghana', 'gibraltar', 'greece', 'greenland', 'grenada', 'guadeloupe', 'guam', 'guatemala', 'guernsey', 'guinea', 'guinea-bissau', 'guyana', 'haiti', 'heard island and mcdonald islands', 'holy see (the)', 'honduras', 'hong kong', 'hungary', 'iceland', 'india', 'indonesia', 'iran (islamic republic of)', 'iraq', 'ireland', 'isle of man', 'israel', 'italy', 'jamaica', 'japan', 'jersey', 'jordan', 'kazakhstan', 'kenya', 'kiribati', "korea (the democratic people's republic of)", 'korea (the republic of)', 'kuwait', 'kyrgyzstan', "lao people's democratic republic (the)", 'latvia', 'lebanon', 'lesotho', 'liberia', 'libya', 'liechtenstein', 'lithuania', 'luxembourg', 'macao', 'macedonia (the former yugoslav republic of)', 'madagascar', 'malawi', 'malaysia', 'maldives', 'mali', 'malta', 'marshall islands (the)', 'martinique', 'mauritania', 'mauritius', 'mayotte', 'mexico', 'micronesia (federated states of)', 'moldova (the republic of)', 'monaco', 'mongolia', 'montenegro', 'montserrat', 'morocco', 'mozambique', 'myanmar', 'namibia', 'nauru', 'nepal', 'netherlands (the)', 'new caledonia', 'new zealand', 'nicaragua', 'niger (the)', 'nigeria', 'niue', 'norfolk island', 'northern mariana islands (the)', 'norway', 'oman', 'pakistan', 'palau', 'palestine, state of', 'panama', 'papua new guinea', 'paraguay', 'peru', 'philippines (the)', 'pitcairn', 'poland', 'portugal', 'puerto rico', 'qatar', 'reunion', 'romania', 'russian federation (the)', 'rwanda', 'saint barthelemy', 'saint helena, ascension and tristan da cunha', 'saint kitts and nevis', 'saint lucia', 'saint martin (french part)', 'saint pierre and miquelon', 'saint vincent and the grenadines', 'samoa', 'san marino', 'sao tome and principe', 'saudi arabia', 'senegal', 'serbia', 'seychelles', 'sierra leone', 'singapore', 'sint maarten (dutch part)', 'slovakia', 'slovenia', 'solomon islands', 'somalia', 'south africa', 'south georgia and the south sandwich islands', 'south sudan', 'spain', 'sri lanka', 'sudan (the)', 'suriname', 'svalbard and jan mayen', 'swaziland', 'sweden', 'switzerland', 'syrian arab republic', 'taiwan (province of china)', 'tajikistan', 'tanzania, united republic of', 'thailand', 'timor-leste', 'togo', 'tokelau', 'tonga', 'trinidad and tobago', 'tunisia', 'turkey', 'turkmenistan', 'turks and caicos islands (the)', 'tuvalu', 'uganda', 'ukraine', 'united arab emirates (the)', 'united kingdom of great britain and northern ireland (the)', 'united states minor outlying islands (the)', 'united states of america (the)', 'uruguay', 'uzbekistan', 'vanuatu', 'venezuela (bolivarian republic of)', 'viet nam', 'virgin islands (british)', 'virgin islands (u.s.)', 'wallis and futuna', 'western sahara*', 'yemen', 'zambia', 'zimbabwe']
    countries = [x.lower() for x in countries]
    countries_ = [temp+"s" for temp in countries]
    STOPWORDS = ["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz"]
    my_stop_words = ["met","till","continue","continues","confirm","confirms","family","families","record","social media","recorded","records","carried","carries","carry","man","women","men","boy","girls","girl","boys","woman","city","state","country","district","tehsil","area","delhis","mumbais","control","areas","hit","hits","set","students","student","cars","net","north","south","east","west","northern","southern","eastern","western","receives","received","received","coast","lie","lies"]
    months = ['january','february','march','april','may','june','july','august','september','october','november','december','jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\|@,;]\'')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    text = BAD_SYMBOLS_RE.sub('', text) 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    text = ' '.join(word for word in text.split() if word not in cities)
    text = ' '.join(word for word in text.split() if word not in cities_) 
    text = ' '.join(word for word in text.split() if word not in states) 
    text = ' '.join(word for word in text.split() if word not in states_) 
    text = ' '.join(word for word in text.split() if word not in countries) 
    text = ' '.join(word for word in text.split() if word not in countries_) 
    text = ' '.join(word for word in text.split() if word not in months) 
    text = ' '.join(word for word in text.split() if word not in my_stop_words) 
    text = ''.join(i for i in text if not i.isdigit())
    return text

# Cleaning the data via the created function
data[COL_NAME_TITLE] = data[COL_NAME_TITLE].apply(clean_text)

In [15]:
# Converting news senteces to numerical vectors

# Maximum number of unique words
MAX_NB_WORDS = 50000

# Length of each vector. Zeros are prepadded in order to make the length of each vector the same. 
MAX_SEQUENCE_LENGTH = 13

# Making a tokenizer model
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)

# Training the tokenizer model on the cleaned data
tokenizer.fit_on_texts(data[COL_NAME_TITLE].values)

# So created word dictionary by the model.
word_index = tokenizer.word_index
print('Number of unique tokens: ' + str(len(word_index)))

# Creating vectors from news sentences using the tokenizer model trained above
X = tokenizer.texts_to_sequences(data[COL_NAME_TITLE].values)

# Creating each vector of the same length 
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Number of unique tokens: 6694
Shape of data tensor: (13962, 13)


In [16]:
# Converting the cateogarical data into one hot encoded data labels
Y = pd.get_dummies(data[COL_NAME_LABEL]).values
print('Shape of label tensor:', Y.shape)
N = Y.shape[1]

Shape of label tensor: (13962, 21)


In [17]:
# Data Visualisation: Pair plot using seaborn

pairplot_file = "pairPlot.png"
try:
    with open(pairplot_file, 'r') as file:  
        print("Pair plot already generated. Please see " + pairplot_file)
except:
    # Creating a copy of the cleaned data 
    X_df = pd.DataFrame(X)
    # Creating the column containing the numerical cateogary value and renaming the column to CATEOGARIES
    df_cats = pd.DataFrame(np.where(Y==1)[1])
    df_cats.columns = ["CATEOGARIES"]

    # Concating the data with CATEOGARIES column and using pairlot
    dff = pd.concat([X_df,df_cats],axis=1)
    with open(pairplot_file,'w') as file:
        file.write(pairlot_seaborn(pd.DataFrame(dff),hue="CATEOGARIES"))

Pair plot already generated. Please see pairPlot.png


In [18]:
if IS_MODEL_FINAL == 0:
    # Ratio of size of test set to train set.
    TEST_SIZE = 0.1

    # Creating train test data
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = TEST_SIZE, random_state = 42,stratify=Y)

elif IS_MODEL_FINAL == 1:
    # If the model is final, no need to create a test set. 
    X_train = X
    Y_train = Y 

print("Training Set",X_train.shape,Y_train.shape)
if IS_MODEL_FINAL == 0:
    print("Test set",X_test.shape,Y_test.shape)

Training Set (13962, 13) (13962, 21)


In [19]:
# Building the model

# Output dimension embedding layer. Input dimension will be the length of the each vector defined earlier.
EMBEDDING_DIM = 50

# Building the model
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(60, dropout=0.4, recurrent_dropout=0.4))
model.add(Dense(N, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 13, 50)            2500000   
                                                                 
 spatial_dropout1d_1 (Spati  (None, 13, 50)            0         
 alDropout1D)                                                    
                                                                 
 lstm_1 (LSTM)               (None, 60)                26640     
                                                                 
 dense_1 (Dense)             (None, 21)                1281      
                                                                 
Total params: 2527921 (9.64 MB)
Trainable params: 2527921 (9.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [20]:
# Training the model

# Max number of epochs allowed for the model to be trained for
epochs = 100

# Batch size for Mini Batch SGD
batch_size = 32

# Defining callbacks to stop the model from overfitting.
callbacks = [EarlyStopping(monitor='val_loss', patience=7, min_delta=0.00001,restore_best_weights = True)]

# Training the model to fit the training data
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


In [21]:
if IS_MODEL_FINAL == 0:
    # Evaluating model accuracy based on the test set
    accr = model.evaluate(X_test,Y_test)
    print("Test set Accuracy: " + str(accr[1]))
    print("Test set Loss: " + str(accr[0]))

    # Creating predictions for the test set to make the confusion matrix
    y_pred = model.predict(X_test)

    # Mkaing confusion matrix.
    confusion = confusion_matrix_tf(np.argmax(Y_test, axis = 1), np.argmax(y_pred, axis = 1))
    print(confusion)

    # Evaluating precision, reacll and f1-score
    precision = precision_score_tf(np.argmax(Y_test, axis = 1), np.argmax(y_pred, axis = 1),average=None)
    print("precision: ",precision)
    recall = recall_score_tf(np.argmax(Y_test, axis = 1), np.argmax(y_pred, axis = 1),average=None)
    print("recall: ",recall)
    f1score = f1_score_tf(np.argmax(Y_test, axis = 1), np.argmax(y_pred, axis = 1),average=None)
    print("f1-score: ",f1score)

    # Plotting train and test loss
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()
    plt.show()

    # Plotting train and test accuracy
    plt.title('Accuracy')
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='test')
    plt.legend()
    plt.show()

In [23]:
# Exporting the model if the final model is ready, using to a pickle file
if IS_MODEL_FINAL: 
    model_pkl_file = "singlemodel.pkl"  
    with open(model_pkl_file, 'wb') as file:  
        dump_pickle([CATEOGARIES,MAX_SEQUENCE_LENGTH,clean_text,tokenizer,model], file)

In [26]:
CATEOGARIES = np.array(sorted(CATEOGARIES))

with open('sampleNews.txt') as f_news:
    news = f_news.readlines()
text_news = news.copy()

news = [clean_text(item) for item in news]
news = tokenizer.texts_to_sequences(news)
news = pad_sequences(news, maxlen=MAX_SEQUENCE_LENGTH)

predictions = np.argmax(model.predict(news),axis=1)
y_pred = CATEOGARIES[predictions]

index = 1
for i in range(len(y_pred)): 
    print(str(index) + ". News: " + text_news[i],"Predicted Class: ",y_pred[i])
    index += 1

 Predicted Class:  Heatwave
2. News: heavy rain in Assam
 Predicted Class:  Rainfall
3. News: Heatwave days may go up this year, says IMD
 Predicted Class:  Heatwave
4. News: Heatwave forecast for Tamil Nadu this month
 Predicted Class:  Heatwave
5. News: Assam: One dead, two missing as boat capsizes in South Salmara-Mankachar district
 Predicted Class:  BoatCapsizes
6. News: Five killed, over 100 injured as cyclonic storm hits West Bengal
 Predicted Class:  Cyclone
7. News: Assam: Child dies, two go missing as single-engine boat capsizes in Brahmaputra
 Predicted Class:  BoatCapsizes
8. News: Odisha, Madhya Pradesh to get hotter
 Predicted Class:  WeatherUpdate
9. News: Assam: 4-Year-Old Child Dies, Two Missing As Boat Capsizes In Brahmaputra Amid Heavy Storm
 Predicted Class:  BoatCapsizes
10. News: Assam: Child dies, two missing as boat capsizes in Brahmaputra amid heavy rain, storm
 Predicted Class:  BoatCapsizes
11. News: Heatwave continues in Telangana, Nalgonda records 42.4 degr