## Hate Speech Detection using Machine Learning

In [49]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [50]:
print("Shape of train:", train.shape)
print("Shape of test:", test.shape)

Shape of train: (31962, 3)
Shape of test: (17197, 2)


In [51]:
train

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [52]:
test

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


In [53]:
import re

def clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df

# tweet is the column
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

In [54]:
test_clean

Unnamed: 0,id,tweet
0,31963,studiolife aislife requires passion dedication...
1,31964,white supremacists want everyone to see the ...
2,31965,safe ways to heal your acne altwaystoheal h...
3,31966,is the hp and the cursed child book up for res...
4,31967,3rd bihday to my amazing hilarious nephew el...
...,...,...
17192,49155,thought factory leftright polarisation trump u...
17193,49156,feeling like a mermaid hairflip neverready fo...
17194,49157,hillary campaigned today in ohioomg amp used w...
17195,49158,happy at work conference right mindset leads t...


In [55]:
train_clean

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for lyft credit i cant use cause they...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in u...
4,5,0,factsguide society now motivation
...,...,...,...
31957,31958,0,ate isz that youuu
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,sikh temple vandalised in in calgary wso cond...


In [56]:
(train['label'] == 0).sum()

29720

In [57]:
(train['label'] == 1).sum()

2242

Since there is an imbalance to the label, we will use resample

In [58]:
#We will balance the data using sklearn.utils

from sklearn.utils import resample

train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]
train_minority_upsampled = resample(train_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(train_majority),    # to match majority class
                                 random_state=123) # reproducible results

train_upsampled = pd.concat([train_minority_upsampled, train_majority])


In [59]:
train_upsampled['label'].value_counts()

label
1    29720
0    29720
Name: count, dtype: int64

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

In [61]:
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),
])

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'], train_upsampled['label'], random_state = 0)

In [63]:
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)

from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.9698546085100707

In [64]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy = accuracy_score(y_test, y_predict)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9695827725437416
Precision: 0.9536988457502623
Recall: 0.9865671641791045


In [1]:
import nltk
# # nltk.download('all')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from ast import literal_eval

In [2]:
data = pd.read_csv('Hotel_Reviews.csv')
data.head()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360576,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360576,4.915968


In [3]:
# Replacing "United Kingdom with "UK"
data.Hotel_Address = data.Hotel_Address.str.replace("United Kingdom", "UK")

# Now I will split the address and pick the last word in the address to identify the country
data['countries'] = data.Hotel_Address.apply(lambda x: x.split()[-1])
print(data.countries.value_counts())
print(data.countries.unique())

countries
UK             262301
Spain           60149
France          59928
Netherlands     57214
Austria         38939
Italy           37207
Name: count, dtype: int64
['Netherlands' 'UK' 'France' 'Spain' 'Italy' 'Austria']


In [4]:
def impute(column):
    column = column[0]
    if (type(column) != list):
        return "".join(literal_eval(column))
    else:
        return column
    
data['Tags'] = data[['Tags']].apply(impute, axis=1)
data.head()

  column = column[0]


Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng,countries
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,Leisure trip Couple Duplex Double Room Sta...,0 days,52.360576,4.915968,Netherlands
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,Leisure trip Couple Duplex Double Room Sta...,0 days,52.360576,4.915968,Netherlands
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,Leisure trip Family with young children Dup...,3 days,52.360576,4.915968,Netherlands
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,Leisure trip Solo traveler Duplex Double Ro...,3 days,52.360576,4.915968,Netherlands
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,Leisure trip Couple Suite Stayed 2 nights ...,10 days,52.360576,4.915968,Netherlands


In [5]:
data['countries'] = data['countries'].str.lower()
data['Tags'] = data['Tags'].str.lower()

In [6]:
def recommend_hotel(location, description):
    description = description.lower()
    word_tokenize(description)
    stop_words = set(stopwords.words('english'))
    lemm = WordNetLemmatizer()
    filtered = {word for word in description if not word in stop_words}
    filtered_set = set()
    for fs in filtered:
        filtered_set.add(lemm.lemmatize(fs))

    country = data[data['countries'] == location.lower()]
    country = country.set_index(np.arange(country.shape[0]))
    list1 = []; list2 = []; cos = [];
    for i in range(country.shape[0]):
        temp_token = word_tokenize(country['Tags'][i])
        temp_set = [word for word in temp_token if not word in stop_words]
        temp2_set = set()
        for s in temp_set:
            temp2_set.add(lemm.lemmatize(s))
        vector = temp2_set.intersection(filtered_set)
        cos.append(len(vector))
    country['similarity'] = cos 
    country = country.sort_values(by='similarity', ascending=False)
    country.drop_duplicates(subset='Hotel_Name', keep='first', inplace=True)
    country.sort_values('Average_Score', ascending=False, inplace=True)
    country.reset_index(inplace=True)
    return country [["Hotel_Name", "Average_Score", "Hotel_Address"]].head()
    

In [7]:
recommend_hotel('UK', 'I am going on a honeymoon, I need a honeymoon suite room for 3 nights')

Unnamed: 0,Hotel_Name,Average_Score,Hotel_Address
0,Haymarket Hotel,9.6,1 Suffolk Place Westminster Borough London SW1...
1,41,9.6,41 Buckingham Palace Road Westminster Borough ...
2,Taj 51 Buckingham Gate Suites and Residences,9.5,Buckingham Gate Westminster Borough London SW1...
3,Charlotte Street Hotel,9.5,15 17 Charlotte Street Hotel Westminster Borou...
4,Ham Yard Hotel,9.5,One Ham Yard Westminster Borough London W1D 7D...


In [78]:
recommend_hotel('Philippines', 'Hotel with a bidet')

Unnamed: 0,Hotel_Name,Average_Score,Hotel_Address


In [79]:
recommend_hotel('UK', 'I am going on a honeymoon')

Unnamed: 0,Hotel_Name,Average_Score,Hotel_Address
0,41,9.6,41 Buckingham Palace Road Westminster Borough ...
1,Haymarket Hotel,9.6,1 Suffolk Place Westminster Borough London SW1...
2,Taj 51 Buckingham Gate Suites and Residences,9.5,Buckingham Gate Westminster Borough London SW1...
3,Ham Yard Hotel,9.5,One Ham Yard Westminster Borough London W1D 7D...
4,Milestone Hotel Kensington,9.5,1 Kensington Court Kensington and Chelsea Lond...


In [80]:
recommend_hotel('Italy', 'I am going to eat a lot of pizza')

Unnamed: 0,Hotel_Name,Average_Score,Hotel_Address
0,Excelsior Hotel Gallia Luxury Collection Hotel,9.4,Piazza Duca D Aosta 9 Central Station 20124 Mi...
1,Palazzo Parigi Hotel Grand Spa Milano,9.3,Corso Di Porta Nuova 1 Milan City Center 20121...
2,Hotel Spadari Al Duomo,9.3,Via Spadari 11 Milan City Center 20123 Milan I...
3,Room Mate Giulia,9.3,Silvio Pellico 4 Milan City Center 20121 Milan...
4,UNA Maison Milano,9.3,Via Mazzini 4 Milan City Center 20123 Milan Italy
