In [1]:
import os
import pandas as pd
import numpy as np
import re
import sklearn
import pickle
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
# Notebook-wide pandas settings.
# Truncate wide text columns at 100 characters when a frame is displayed.
pd.set_option('display.max_colwidth', 100)
# Turn off the chained-assignment (SettingWithCopy) check for the whole session.
pd.options.mode.chained_assignment = None

In [2]:
# Show the installed scikit-learn version next to the models pickled below.
# NOTE(review): pickled estimators presumably need a matching version to reload safely; confirm.
sklearn.__version__

'0.23.1'

# Language detection model fitting

In [4]:
# Project folder on the author's machine; both locations below live inside it.
_project_root = "C:\\Users\\Thanis\\Desktop\\Data Science\\Project\\multi lang v1\\"

# Input folder with the CSV files. The trailing separator is kept because
# file names are appended to this string directly.
path = _project_root + "Data\\"
# Output folder that receives the pickled vectorizers and models.
saved_path = _project_root + "Saved model\\"

In [2]:
# Concatenate every file in the input folder into one DataFrame.
# Files are listed in sorted order so the row order is the same on every run,
# and all frames are combined with a single pd.concat call instead of
# re-copying the accumulated frame on each loop iteration.
files = sorted(os.listdir(path))
frames = [pd.read_csv(path + file) for file in files]

# pd.concat raises on an empty list; keep the empty-frame result for an empty folder.
all_data = pd.concat(frames) if frames else pd.DataFrame()

In [3]:
# Number of rows available per language code.
all_data['lang'].value_counts()

TU    31277
ID    13169
GR     8743
HI     4665
FR     4014
en     3967
AR     3353
DA     2960
Name: lang, dtype: int64

In [4]:
# Draw a balanced training sample of 2000 rows per language.
# A fixed seed makes the draw repeatable after a kernel restart; without it
# every run trains the language model on a different subset.
SAMPLE_SEED = 42
tu_train = all_data[all_data.lang == 'TU'].sample(2000, random_state=SAMPLE_SEED)
id_train = all_data[all_data.lang == 'ID'].sample(2000, random_state=SAMPLE_SEED)
gr_train = all_data[all_data.lang == 'GR'].sample(2000, random_state=SAMPLE_SEED)
hi_train = all_data[all_data.lang == 'HI'].sample(2000, random_state=SAMPLE_SEED)
fr_train = all_data[all_data.lang == 'FR'].sample(2000, random_state=SAMPLE_SEED)
en_train = all_data[all_data.lang == 'en'].sample(2000, random_state=SAMPLE_SEED)  # English is coded in lower case
ar_train = all_data[all_data.lang == 'AR'].sample(2000, random_state=SAMPLE_SEED)
da_train = all_data[all_data.lang == 'DA'].sample(2000, random_state=SAMPLE_SEED)

In [5]:
# Concatenation of the sampled data.
list_train = [tu_train, id_train, gr_train, hi_train, fr_train, en_train, ar_train, da_train]
# One concat over the whole list instead of growing the frame pairwise in a loop.
train = pd.concat(list_train)

In [6]:
# Replace the index inherited from the source frames with a fresh 0..n-1 range
# (drop=True discards the old index instead of keeping it as a column).
train = train.reset_index(drop = True)

In [7]:
# Strip @mentions (an '@' followed by word characters) from every tweet.
# The vectorized .str.replace applies the same regex substitution as a per-row
# re.sub, without a Python-level loop of .loc writes.
# Missing tweets stay NaN here instead of raising a TypeError.
train["tweet"] = train["tweet"].str.replace(r'@\w+', '', regex=True)

In [8]:
# Split into features (tweet text) and target (language code).
train_X, train_y = train['tweet'], train['lang']

In [9]:
def train_lang_transform(train_X, tfidf_vectorizer):
    """Fit ``tfidf_vectorizer`` on ``train_X`` and return a dense feature table.

    Parameters
    ----------
    train_X
        Documents handed to ``tfidf_vectorizer.fit_transform``.
    tfidf_vectorizer
        Vectorizer exposing ``fit_transform`` and either
        ``get_feature_names_out`` or the older ``get_feature_names``.
        It is fitted in place.

    Returns
    -------
    pandas.DataFrame
        One row per document. Only every second column of the fitted matrix
        (positions 0, 2, 4, ...) is kept, labelled with its vocabulary term.

    Notes
    -----
    NOTE(review): the vectorizer itself is fitted on the full vocabulary, so
    code that transforms new text with it must apply the same ``0::2`` column
    selection before calling a model trained on this output.
    """
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_X)

    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # get_feature_names_out() and fall back to the old name when it is absent.
    get_names = getattr(tfidf_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = tfidf_vectorizer.get_feature_names
    vocabulary = list(get_names())

    # Select the kept columns while the matrix is still sparse so that only
    # half of it is densified. toarray() gives a plain ndarray rather than
    # the legacy numpy.matrix returned by todense().
    kept = tfidf_matrix_train[:, 0::2].toarray()
    return pd.DataFrame(data=kept, columns=vocabulary[0::2])

In [11]:
# Word-unigram TF-IDF vectorizer with the vocabulary capped at 20000 terms.
tfidf_lang_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_features=20000,
)
# Fit it on the tweets and swap the raw text for the resulting feature table.
train_X = train_lang_transform(train_X, tfidf_lang_vectorizer)

In [12]:
# Train the multinomial Naive Bayes language classifier on the TF-IDF features.
mnb_lang = MultinomialNB(alpha=0.0005)
lang_fitted = mnb_lang.fit(X=train_X, y=train_y)

In [13]:
# Save the fitted TF-IDF vectorizer as a pickle file.
filename = saved_path + 'vectorized_lang_model.sav'
# The with-block closes the file handle once the dump has finished.
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_lang_vectorizer, model_file)

In [14]:
# Save the fitted language model as a pickle file.
filename = saved_path + 'finalized_lang_model.sav'
# The with-block closes the file handle once the dump has finished.
with open(filename, 'wb') as model_file:
    pickle.dump(lang_fitted, model_file)

# Hate speech detection model fitting

In [5]:
#Importing Training Dataset
# One CSV per language, all read from the input folder `path`.
ar = pd.read_csv(path + "arabic_dataset_modified.csv")
da = pd.read_csv(path + "danish_dataset_modified.csv")
hi = pd.read_csv(path + "hindi_dataset_modified.csv")
tu = pd.read_csv(path + "turk_dataset_modified.csv")
# NOTE(review): `id` shadows the Python builtin of the same name for the rest of the notebook.
# Only this file is read with the Python parsing engine; the reason is not recorded here
# (presumably the default parser failed on it; TODO confirm).
id = pd.read_csv(path + "indo_dataset_modified.csv",engine='python')
gr = pd.read_csv(path + "greek_dataset_modified.csv")
fr = pd.read_csv(path + "french_dataset_modified.csv")
en = pd.read_csv(path + "english_dataset_modified.csv")
#en = pd.read_csv(path + "english_dataset_modified_1.csv")

In [6]:
def train_transform(train_X, tfidf_vectorizer):
    """Fit ``tfidf_vectorizer`` on ``train_X`` and return a dense feature table.

    Parameters
    ----------
    train_X
        Documents handed to ``tfidf_vectorizer.fit_transform``.
    tfidf_vectorizer
        Vectorizer exposing ``fit_transform`` and either
        ``get_feature_names_out`` or the older ``get_feature_names``.
        It is fitted in place.

    Returns
    -------
    pandas.DataFrame
        One row per document. Only every second column of the fitted matrix
        (positions 0, 2, 4, ...) is kept, labelled with its vocabulary term.

    Notes
    -----
    NOTE(review): the vectorizer itself is fitted on the full vocabulary, so
    code that transforms new text with it must apply the same ``0::2`` column
    selection before calling a model trained on this output.
    """
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_X)

    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # get_feature_names_out() and fall back to the old name when it is absent.
    get_names = getattr(tfidf_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = tfidf_vectorizer.get_feature_names
    vocabulary = list(get_names())

    # Select the kept columns while the matrix is still sparse so that only
    # half of it is densified. toarray() gives a plain ndarray rather than
    # the legacy numpy.matrix returned by todense().
    kept = tfidf_matrix_train[:, 0::2].toarray()
    return pd.DataFrame(data=kept, columns=vocabulary[0::2])

# English

In [7]:
# Features are the raw tweets; targets come from the 'label' column.
train_en_X = en['tweet']
train_en_y = en['label']

# Word-level TF-IDF over unigrams, bigrams and trigrams.
tfidf_en_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
train_en_X = train_transform(train_en_X, tfidf_en_vectorizer)

# Linear SVM. random_state pins the solver's internal shuffling so that
# refitting on the same data gives the same model.
svm_en = LinearSVC(C=5, random_state=42)
en_fitted = svm_en.fit(train_en_X, train_en_y)

In [8]:
# Save the fitted English TF-IDF vectorizer; the with-block closes the file handle.
filename = saved_path + 'vectorized_en_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_en_vectorizer, model_file)

In [9]:
# Save the fitted English classifier; the with-block closes the file handle.
filename = saved_path + 'finalized_en_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(en_fitted, model_file)

# French

In [21]:
# Features are the raw tweets; targets come from the 'label' column.
train_fr_X = fr['tweet']
train_fr_y = fr['label']

# Word-level TF-IDF over unigrams, bigrams and trigrams.
tfidf_fr_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
train_fr_X = train_transform(train_fr_X, tfidf_fr_vectorizer)

# Linear SVM. random_state pins the solver's internal shuffling so that
# refitting on the same data gives the same model.
svm_fr = LinearSVC(C=5, random_state=42)
fr_fitted = svm_fr.fit(train_fr_X, train_fr_y)

In [22]:
# Save the fitted French TF-IDF vectorizer; the with-block closes the file handle.
filename = saved_path + 'vectorized_fr_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_fr_vectorizer, model_file)

In [23]:
# Save the fitted French classifier; the with-block closes the file handle.
filename = saved_path + 'finalized_fr_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(fr_fitted, model_file)

# Arabic

In [24]:
# Features are the raw tweets; targets come from the 'label' column.
train_ar_X = ar['tweet']
train_ar_y = ar['label']

# Word-level TF-IDF over unigrams, bigrams and trigrams.
tfidf_ar_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
train_ar_X = train_transform(train_ar_X, tfidf_ar_vectorizer)

# Linear SVM. random_state pins the solver's internal shuffling so that
# refitting on the same data gives the same model.
svm_ar = LinearSVC(C=5, random_state=42)
ar_fitted = svm_ar.fit(train_ar_X, train_ar_y)

In [25]:
# Save the fitted Arabic TF-IDF vectorizer; the with-block closes the file handle.
filename = saved_path + 'vectorized_ar_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_ar_vectorizer, model_file)

In [26]:
# Save the fitted Arabic classifier; the with-block closes the file handle.
filename = saved_path + 'finalized_ar_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(ar_fitted, model_file)

# Indonesian

In [None]:
# Train on a random subset of 5000 rows; the fixed seed makes the subset repeatable.
id1 = id.sample(5000, random_state=42)

In [None]:
# Features are the raw tweets; targets come from the 'label' column.
train_in_X = id1['tweet']
train_in_y = id1['label']

# Word-level TF-IDF over unigrams, bigrams and trigrams.
tfidf_in_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
train_in_X = train_transform(train_in_X, tfidf_in_vectorizer)

# Linear SVM. random_state pins the solver's internal shuffling so that
# refitting on the same data gives the same model.
svm_in = LinearSVC(C=5, random_state=42)
in_fitted = svm_in.fit(train_in_X, train_in_y)

In [None]:
# Save the fitted TF-IDF vectorizer for the 'indo' dataset; the with-block closes the file handle.
filename = saved_path + 'vectorized_in_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_in_vectorizer, model_file)

In [None]:
# Save the fitted classifier for the 'indo' dataset; the with-block closes the file handle.
filename = saved_path + 'finalized_in_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(in_fitted, model_file)

# Danish

In [None]:
# Features are the raw tweets; targets come from the 'label' column.
train_da_X = da['tweet']
train_da_y = da['label']

# Word-level TF-IDF over unigrams, bigrams and trigrams.
tfidf_da_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
train_da_X = train_transform(train_da_X, tfidf_da_vectorizer)

# Bagging ensemble of 100 estimators. The ensemble resamples the training data
# at random, so random_state is fixed to make the fitted model repeatable.
bag_da = BaggingClassifier(n_estimators=100, random_state=42)
da_fitted = bag_da.fit(train_da_X, train_da_y)

In [None]:
# Save the fitted Danish TF-IDF vectorizer; the with-block closes the file handle.
filename = saved_path + 'vectorized_da_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_da_vectorizer, model_file)

In [None]:
# Save the fitted Danish classifier; the with-block closes the file handle.
filename = saved_path + 'finalized_da_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(da_fitted, model_file)

# Hindi

In [27]:
# Features are the raw tweets; targets come from the 'label' column.
train_hi_X = hi['tweet']
train_hi_y = hi['label']

# Word-level TF-IDF over unigrams, bigrams and trigrams.
tfidf_hi_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
train_hi_X = train_transform(train_hi_X, tfidf_hi_vectorizer)

# Random forest of 100 trees. Tree construction is randomized, so
# random_state is fixed to make the fitted model repeatable.
rf_hi = RandomForestClassifier(n_estimators=100, random_state=42)
hi_fitted = rf_hi.fit(train_hi_X, train_hi_y)

In [28]:
# Save the fitted Hindi TF-IDF vectorizer; the with-block closes the file handle.
filename = saved_path + 'vectorized_hi_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_hi_vectorizer, model_file)

In [29]:
# Save the fitted Hindi classifier; the with-block closes the file handle.
filename = saved_path + 'finalized_hi_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(hi_fitted, model_file)

# Turkish

In [None]:
# Train on a random subset of 5000 rows; the fixed seed makes the subset repeatable.
tu1 = tu.sample(5000, random_state=42)

In [None]:
# Features are the raw tweets; targets come from the 'label' column.
train_tu_X = tu1['tweet']
train_tu_y = tu1['label']

# Word-level TF-IDF over unigrams, bigrams and trigrams.
tfidf_tu_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
train_tu_X = train_transform(train_tu_X, tfidf_tu_vectorizer)

# Linear SVM. random_state pins the solver's internal shuffling so that
# refitting on the same data gives the same model.
svm_tu = LinearSVC(C=5, random_state=42)
tu_fitted = svm_tu.fit(train_tu_X, train_tu_y)

In [None]:
# Save the fitted Turkish TF-IDF vectorizer; the with-block closes the file handle.
filename = saved_path + 'vectorized_tu_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_tu_vectorizer, model_file)

In [None]:
# Save the fitted Turkish classifier; the with-block closes the file handle.
filename = saved_path + 'finalized_tu_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tu_fitted, model_file)

# Greek

In [None]:
# Train on a random subset of 5000 rows; the fixed seed makes the subset repeatable.
gr1 = gr.sample(5000, random_state=42)

In [None]:
# Features are the raw tweets; targets come from the 'label' column.
train_gr_X = gr1['tweet']
train_gr_y = gr1['label']

# Word-level TF-IDF over unigrams, bigrams and trigrams.
tfidf_gr_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
train_gr_X = train_transform(train_gr_X, tfidf_gr_vectorizer)

# Linear SVM. random_state pins the solver's internal shuffling so that
# refitting on the same data gives the same model.
svm_gr = LinearSVC(C=5, random_state=42)
gr_fitted = svm_gr.fit(train_gr_X, train_gr_y)

In [None]:
# Save the fitted Greek TF-IDF vectorizer; the with-block closes the file handle.
filename = saved_path + 'vectorized_gr_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf_gr_vectorizer, model_file)

In [None]:
# Save the fitted Greek classifier; the with-block closes the file handle.
filename = saved_path + 'finalized_gr_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(gr_fitted, model_file)