# **Detectar insultos en redes sociales.**


---



In [32]:
import re, string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier


**Load raw data**


---



In [6]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Switch the working directory to the Drive folder used for the datasets and list it.
# NOTE(review): this is an absolute, Colab-specific path; later relative reads depend on it.
%cd '/content/drive/My Drive/Colab Notebooks/db'
!ls

/content/drive/My Drive/Colab Notebooks/db
insults.csv    london_merged.csv  Mall_Customers.csv	  titanic.csv
insurance.csv  LR_ML.xlsx	  Meteorite_Landings.csv


In [5]:
def clean_text(text):
  """Normalise a comment into space-separated lower-case alphabetic words.

  The text is lower-cased, then every run of the letters a-z that is
  delimited by word boundaries on both sides is kept. Tokens that touch a
  digit, underscore or non-ASCII letter are therefore dropped entirely, and
  punctuation acts as a separator (``don't`` becomes ``don t``).

  Args:
    text: The raw comment string.

  Returns:
    The retained words joined by single spaces (empty string if none).
  """
  words = re.findall(r'\b[a-z]+\b', text.lower())
  return ' '.join(words)

In [8]:
# Load the labelled comments. The relative path resolves against the working
# directory selected by the earlier %cd cell.
training_data = pd.read_csv('insults.csv')
training_data.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In [9]:
# Add a normalised copy of every comment (via clean_text) as a new column.
# This mutates training_data in place; the raw 'Comment' column is kept.
training_data['cleaned_comment'] = training_data['Comment'].map(clean_text)
training_data.head()

Unnamed: 0,Insult,Date,Comment,cleaned_comment
0,1,20120618192155Z,"""You fuck your dad.""",you fuck your dad
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ...",i really don t understand your point it seems ...
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ...",a of canadians can and has been wrong before n...
3,0,,"""listen if you dont wanna get married to a man...",listen if you dont wanna get married to a man ...
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",c b xu bi t xecnh c ho kh nc ng d ng cu xed ch...


**Make some features**


---



In [34]:
# Word-level count features over 1- to 3-grams with English stop words removed
# and max_features set to 50000.
# NOTE(review): the vectorizer is fitted on every row, including rows that are
# later held out for validation.
count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english', max_features=50000)
count_vectorizer.fit(training_data['cleaned_comment'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=50000, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [35]:
# Feature matrix built from the cleaned comments, and the target labels taken
# from the 'Insult' column.
X = count_vectorizer.transform(training_data['cleaned_comment'])
y = training_data['Insult']

**Cross-Validate**


---



*Split-data*

In [None]:
# Draw one Bernoulli(0.75) flag per row in a single vectorised call. The mask is
# kept as a boolean NumPy array so it can be negated with `~` to select the
# complementary rows (a plain Python list does not support `~`).
mask = np.random.binomial(1, .75, size=X.shape[0]).astype(bool)
# Preview only the first few flags instead of echoing one value per row.
mask[:10]

In [37]:
# Index the feature matrix with the mask (the result is neither stored nor
# displayed), then count how many rows the mask selects.
X[np.array(mask)]
sum(mask)

2945

In [38]:
# Number of labels selected by the mask; expected to equal the count of True flags.
y[mask].shape[0]

2945

In [None]:
# Shape of the labels NOT selected by the mask. `~` is an element-wise negation
# only on NumPy arrays (on a Python list it raises TypeError), so convert first.
y[~np.array(mask)].shape

In [21]:
def split_data(X, y, p=0.75, random_state=None):
  """Randomly split features and labels into training and validation parts.

  Every row is assigned to the training part independently with probability
  ``p``, so the sizes of the two parts vary from call to call.

  Args:
    X: Feature container with a ``shape`` attribute that can be indexed by a
      boolean NumPy array along its first axis.
    y: Labels aligned with the rows of ``X``; must accept the same mask.
    p: Probability that a row goes to the training part (0 <= p <= 1).
    random_state: Optional seed for a private generator, making the split
      reproducible. When None, NumPy's global random state is used.

  Returns:
    The tuple ``(X_train, y_train, X_validation, y_validation)``.

  Raises:
    ValueError: If ``p`` is not within ``[0, 1]``.
  """
  if not 0 <= p <= 1:
    raise ValueError('p must be between 0 and 1, got {}'.format(p))

  # Fall back to the module-level generator so np.random.seed() still applies.
  rng = np.random if random_state is None else np.random.RandomState(random_state)

  # One vectorised draw that honours `p` (the value was previously hard-coded).
  mask = rng.binomial(1, p, size=X.shape[0]).astype(bool)

  return X[mask], y[mask], X[~mask], y[~mask]

In [28]:
# One random train/validation split using split_data's default settings.
X_train, y_train, X_validation, y_validation = split_data(X, y)

*Fit a model on training data*

In [29]:
# Fit a logistic-regression classifier with default settings on the training part only.
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

*Validate model on validation data*

In [30]:
# Predict on the held-out rows and score the predictions with plain accuracy.
predictions = model.predict(X_validation)
# All-zero predictions (always class 0) of the same length; scored in the next
# cell as a baseline.
baseline_pred = np.zeros(predictions.shape[0])
validation_score = accuracy_score(y_validation, predictions)

print('Validation Score: ', validation_score)


Validation Score:  0.8189823874755382


In [31]:
# Accuracy obtained by always predicting 0, for comparison with the model's
# validation score. Note the printed label does not say "baseline".
baseline_validation_score = accuracy_score(y_validation, baseline_pred)
print('Validation score: ', baseline_validation_score)

Validation score:  0.7318982387475538


**Everything is a hyper-parameter**


---



In [44]:
class PredictionPipeline:
  """Vectorise comments, fit a classifier and score it on a random hold-out split.

  One instance corresponds to one combination of n-gram range, vectorizer
  class and model class. Call `run` to execute all stages; the resulting
  accuracy is stored in `validation_score`.

  NOTE(review): the vectorizer is fitted on all rows before the split, so its
  vocabulary also reflects rows that end up in the validation part.
  """

  def __init__(self, ngram_range, vectorizer_class, model_class, training_data):
    """Store the configuration; nothing is fitted until `run` is called.

    Args:
      ngram_range: Value forwarded as `ngram_range` to the vectorizer.
      vectorizer_class: Class that accepts the keyword arguments `analyzer`,
        `ngram_range`, `stop_words` and `max_features`, and whose instances
        provide `fit` and `transform`.
      model_class: Class constructible without arguments whose instances
        provide `fit` and `predict`.
      training_data: Table indexable by the column names 'cleaned_comment'
        and 'Insult'.
    """
    self.ngram_range = ngram_range
    self.vectorizer_class = vectorizer_class
    self.model_class = model_class
    self.training_data = training_data
    self.vectorizer = None
    self.X = None
    self.y = None
    # Filled in by _split_train_and_validation_sets.
    self.X_train = None
    self.y_train = None
    self.X_validation = None
    self.y_validation = None
    self.model = None
    self.validation_score = None

  def run(self):
    """Execute every stage in order and print a summary of the configuration and score."""
    self._fit_vectorizer()
    self._featurize_text()
    self._split_train_and_validation_sets()
    self._fit_model_on_training_data()
    self._validate_model_on_validation_set()

    print(
            """
            Vectorizer Class: {vectorizer_class}\n\
            N-gram Range: {ngram_range}\n\
            Model Class: {model_class}\n\
            Validation Score: {validation_score}
            """.format(
                vectorizer_class=repr(self.vectorizer_class.__name__),
                ngram_range=self.ngram_range,
                model_class=repr(self.model_class.__name__),
                validation_score=round(self.validation_score, 4)
            )
    )

  def _fit_vectorizer(self):
    """Build the configured vectorizer and fit it on the cleaned comments."""
    # Use the instance's own configuration. Reading the bare names
    # `vectorizer_class` / `ngram_range` would silently pick up whatever
    # globals of that name exist (or raise NameError when none do).
    self.vectorizer = self.vectorizer_class(
        analyzer='word', ngram_range=self.ngram_range,
        stop_words='english', max_features=50000)
    self.vectorizer.fit(self.training_data['cleaned_comment'])

  def _featurize_text(self):
    """Transform the cleaned comments into features and pick up the labels."""
    self.X = self.vectorizer.transform(self.training_data['cleaned_comment'])
    self.y = self.training_data['Insult']

  def _split_train_and_validation_sets(self):
    """Split features and labels with the notebook's split_data helper."""
    # split_data draws a new random mask on every call, so each pipeline run
    # is evaluated on a different validation subset.
    (self.X_train, self.y_train,
     self.X_validation, self.y_validation) = split_data(self.X, self.y)

  def _fit_model_on_training_data(self):
    """Instantiate the model with its defaults and fit it on the training part."""
    self.model = self.model_class()
    self.model.fit(self.X_train, self.y_train)

  def _validate_model_on_validation_set(self):
    """Store the accuracy of the fitted model on the validation part."""
    predictions = self.model.predict(self.X_validation)
    self.validation_score = accuracy_score(self.y_validation, predictions)

In [45]:
# Maps str(validation score) -> the hyper-parameters that produced it.
# NOTE(review): because the score string is the key, two configurations with an
# identical score would overwrite each other.
results = {}

# Exhaustive sweep over every n-gram range / vectorizer / model combination.
for ngram_range in [(1, 1), (1, 2), (1, 3), (1, 4)]:
  for vectorizer_class in [CountVectorizer, TfidfVectorizer]:
    for model_class in [LogisticRegression, LinearSVC, RandomForestClassifier]:

      # Build and run the full pipeline for this combination (prints a summary).
      prediction_pipeline = PredictionPipeline(
          ngram_range=ngram_range,
          vectorizer_class=vectorizer_class,
          model_class=model_class,
          training_data=training_data
            )
      
      prediction_pipeline.run()

      # Record the combination under its validation score.
      results[str(prediction_pipeline.validation_score)] = {
              'vectorizer_class': prediction_pipeline.vectorizer_class,
              'ngram_range': prediction_pipeline.ngram_range,
              'model_class': prediction_pipeline.model_class
            }



            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LogisticRegression'
            Validation Score: 0.8374
            

            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LinearSVC'
            Validation Score: 0.8241
            

            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'RandomForestClassifier'
            Validation Score: 0.813
            

            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LogisticRegression'
            Validation Score: 0.7889
            

            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LinearSVC'
            Validation Score: 0.8326
            

            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'RandomForestClass

In [46]:
# The keys of `results` are validation scores stored as strings. Rank them by
# numeric value rather than character order, which mis-orders some float
# representations (e.g. '5e-05' sorts above '0.9' as text).
top_3_scores = sorted(results.keys(), key=float, reverse=True)[:3]

# Show the three best configurations, best first.
for score in top_3_scores:
  print('Score: {score}\nParameters: {parameters}\n'.format(
        score=score, parameters=results[score]))

Score: 0.8373812038014784
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'ngram_range': (1, 1), 'model_class': <class 'sklearn.linear_model._logistic.LogisticRegression'>}

Score: 0.8326403326403327
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, 'ngram_range': (1, 1), 'model_class': <class 'sklearn.svm._classes.LinearSVC'>}

Score: 0.8316532258064516
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'ngram_range': (1, 2), 'model_class': <class 'sklearn.ensemble._forest.RandomForestClassifier'>}



**Train final model**

---



In [47]:
# Key (the score string) of the best-ranked configuration.
top_score_key = top_3_scores[0]

In [48]:
# Look up the winning hyper-parameters. These assignments rebind the
# module-level names that the grid-search loop also used as loop variables.
vectorizer_class = results[top_score_key]['vectorizer_class']
ngram_range = results[top_score_key]['ngram_range']
model_class = results[top_score_key]['model_class']

# Fit the chosen vectorizer on every cleaned comment (no hold-out this time).
vectorizer = vectorizer_class(analyzer='word', ngram_range=ngram_range, stop_words='english', max_features=50000)
vectorizer.fit(training_data['cleaned_comment'])

# Featurize the full data set; this overwrites the earlier X and y.
X = vectorizer.transform(training_data['cleaned_comment'])
y = training_data['Insult']

# Fit the final model on all rows; this overwrites the earlier `model`.
model = model_class()
model.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

**Run it live**

---


In [None]:
# Interactive demo: classify typed strings until the cell is interrupted
# (Kernel > Interrupt / Ctrl-C) or the input stream ends.
while True:
    try:
        input_string = input('Please enter a string: ')
    except (KeyboardInterrupt, EOFError):
        # Leave the loop cleanly instead of ending the cell with a traceback.
        break

    # Apply the same cleaning that was used on the training text before vectorising.
    x_test = vectorizer.transform([clean_text(input_string)])

    prediction = model.predict(x_test)[0]
    print('Insult?: {}'.format( bool(prediction)))