In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
# Fetch the NLTK English stop-word corpus; preprocessing() reads it through
# stopwords.words('english').
nltk.download('stopwords')
# Fetch the Punkt tokenizer models.
# NOTE(review): the visible cells tokenize with TweetTokenizer only, so this
# download may be unnecessary — confirm before removing.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Read the balanced sarcasm corpus, discard the metadata columns that are not
# used as features, and remove every row that still contains a missing value.
df = (
    pd.read_csv('sarcasm-detector/data/train-balanced-sarcasm.csv')
    .drop(columns=[
        'author', 'date', 'created_utc', 'parent_comment',
        'subreddit', 'downs', 'ups', 'score',
    ])
    .dropna()
)
df

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.
...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...
1010822,1,"whatever you do, don't vote green!"
1010823,1,Perhaps this is an atheist conspiracy to make ...
1010824,1,The Slavs got their own country - it is called...


Гипотезы:
1) Не удалять скобки, двоеточия, знаки вопроса (для этого сплитать по твитам)
2) Проверить caps: если всё слово написано капсом, не переводить его в нижний регистр (проверка с помощью метода isupper())
3) Исправить сплит строк, введя новые виды символов: несколько вопросительных знаков подряд, несколько восклицательных знаков подряд (для многоточия такой символ уже есть)

In [5]:
import re
import string
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Lazily-filled cache so the tokenizer and the stop-token set are built once
# instead of on every call to preprocessing().
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return the shared ``(tokenizer, stop_tokens)`` pair, building it on first use."""
    if 'resources' not in _PREPROCESSING_CACHE:
        # Tokenizer leftovers (clitics, quote marks, empty strings) that carry
        # no signal on their own.
        custom_sw = ["'s", "``", "'m", "'d", "'re", "--", "(", ")", "", " ", "n't", "'t", "'"]
        stop_tokens = frozenset(
            list(stopwords.words('english')) + list(string.punctuation) + custom_sw
        )
        # Stored as one entry so a failure while loading the stop words cannot
        # leave a half-initialised cache behind.
        _PREPROCESSING_CACHE['resources'] = (TweetTokenizer(), stop_tokens)
    return _PREPROCESSING_CACHE['resources']


def preprocessing(text):
    """Tokenize one comment into a list of cleaned tokens.

    Steps, in order:
      1. every run of ``?``/``!`` is replaced by the marker word ``featuremark``;
      2. every run of two or more dots is replaced by ``featuredot``;
      3. digits are removed;
      4. the text is split with NLTK's ``TweetTokenizer``;
      5. tokens are lower-cased unless ``str.isupper()`` is true for them;
      6. English stop words, single punctuation characters and a few tokenizer
         leftovers are dropped.

    The stop-token comparison is case-sensitive and happens after step 5, so a
    token left in upper case is only removed if that exact upper-case form is
    in the stop-token set.

    Args:
        text: the raw comment as a ``str``.

    Returns:
        list[str]: the remaining tokens in their original order.
    """
    # Raw strings: '\.' in a normal literal is an invalid escape sequence.
    text = re.sub(r'[?!]+', ' featuremark ', text)
    text = re.sub(r'\.{2,}', ' featuredot ', text)
    text = re.sub(r'[0-9]+', '', text)

    tokenizer, stop_tokens = _preprocessing_resources()
    words = tokenizer.tokenize(text)

    # Keep fully upper-case tokens as written; normalise everything else.
    words = [word if word.isupper() else word.lower() for word in words]
    return [word for word in words if word not in stop_tokens]

def create_prep_dataframe(series):
    """Build a frame holding the raw comments next to their token lists.

    The Series is converted to a one-column frame; that column is accessed as
    ``comment``, so the Series has to carry that name. ``preprocessing`` is
    applied to each comment and stored in a new ``prep`` column, and the index
    is replaced by a fresh 0..n-1 range.

    Args:
        series: pandas Series of comment strings named ``comment``.

    Returns:
        pandas.DataFrame with columns ``comment`` and ``prep``.
    """
    prepared = series.to_frame()
    prepared['prep'] = prepared.comment.apply(preprocessing)
    return prepared.reset_index(drop=True)

# Hold out a quarter of the comments for evaluation; the fixed seed keeps the
# split reproducible.
series_train, series_test, y_train, y_test = train_test_split(
    df['comment'],
    df.label,
    test_size=0.25,
    random_state=42,
)

# Tokenize each split separately. The frames get a fresh 0..n-1 index while
# y_train / y_test keep the index of `df`; row order is the same in both.
df_train = create_prep_dataframe(series_train)
df_test = create_prep_dataframe(series_test)

df_train

Unnamed: 0,comment,prep
0,"Not drafting Reed, Olsen, or Gronk at where th...","[drafting, reed, olsen, gronk, they'll, likely..."
1,Don't you just love the FPTP.,"[love, FPTP]"
2,"The question is why Chief Delphi, TBA, and the...","[question, chief, delphi, TBA, subreddit, gone..."
3,"easier gold farm is warrior Bolster, Target Du...","[easier, gold, farm, warrior, bolster, target,..."
4,What a cuck!,"[cuck, featuremark]"
...,...,...
758074,Pirate it!,"[pirate, featuremark]"
758075,My state does the on call judges too.,"[state, call, judges]"
758076,Temporal displacement,"[temporal, displacement]"
758077,God it's like people who idolize Tony Montana ...,"[god, like, people, idolize, tony, montana, wa..."


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams, bigrams and trigrams.
# NOTE(review): TfidfVectorizer lower-cases its input by default, which would
# discard the upper-case tokens that preprocessing() deliberately keeps —
# confirm whether lowercase=False was intended.
tfidf = TfidfVectorizer(ngram_range=(1, 3))

# The vectorizer consumes plain strings, so glue every token list back
# together with single spaces.
df_train['prep_str'] = df_train['prep'].map(' '.join)
df_test['prep_str'] = df_test['prep'].map(' '.join)

# Fit on the training split only, then reuse the fitted vectorizer for the
# test split.
X_train = tfidf.fit_transform(df_train['prep_str'])
X_test = tfidf.transform(df_test['prep_str'])

# The labels are used unchanged; these names are plain aliases.
y_train_cleaned = y_train
y_test_cleaned = y_test

In [7]:
# Dimensions of the training TF-IDF matrix: (rows of df_train, n-gram features).
X_train.shape

(758079, 5059349)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score

# Hyper-parameter grid for LogisticRegression. It is not consumed in this cell;
# the values used below are the recorded winners of an earlier search.
# NOTE(review): 1e-2 is listed twice in `tol` — possibly a typo; left unchanged.
params = {'tol':[1e-2,1e-2,1e-3,1e-4,1e-5,1e-6],
         'C':np.arange(1,5,0.5),
         'random_state':[42],
         'n_jobs':[-1],
         'max_iter':[300],
         'solver':['sag','saga']}


# best params: {'tol': 1e-06, 'solver': 'sag', 'random_state': 42, 'n_jobs': -1, 'max_iter': 300, 'C': 2.0}
model = LogisticRegression(tol=1e-6,solver='sag',random_state=42,n_jobs=-1,max_iter=300,C=2.0)
model.fit(X_train,y_train_cleaned)
y_pred = model.predict(X_test)

# f1_score expects (y_true, y_pred). The order matters for the weighted
# average, whose class weights are the support counts of the first argument.
print(f'f1 macro: {f1_score(y_test_cleaned,y_pred,average="macro")}')
print(f'f1 micro: {f1_score(y_test_cleaned,y_pred,average="micro")}')
print(f'f1 weighted: {f1_score(y_test_cleaned,y_pred,average="weighted")}')

f1 macro: 0.6939471664654866
f1 micro: 0.6939658242775848
f1 weighted: 0.6939787892346989


In [9]:
from joblib import dump

# Persist the fitted LogisticRegression so it can be reloaded without retraining.
dump(model, 'sarcasm-detector/models/LogReg.joblib')

['sarcasm-detector/models/LogReg.joblib']

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a vectorizer with the same n-gram range on the `prep_str` column stored
# in data/dict.csv. The frame is not called `dict`, so the builtin stays usable.
# NOTE(review): for the saved classifier to work with this vectorizer, the CSV
# must hold exactly the strings the classifier was trained on — confirm.
vocab_df = pd.read_csv('data/dict.csv')
tfidf = TfidfVectorizer(ngram_range=(1,3))
# astype('U') turns every entry into str; a NaN read from the CSV becomes 'nan'.
tfidf.fit(vocab_df.prep_str.values.astype('U'))

In [5]:
import joblib

# Store the fitted vectorizer under models/, which is where the ONNX export
# cell loads it from ('models/tfidf_vectorizer.pkl').
joblib.dump(tfidf, 'models/tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [4]:
import onnx
from joblib import load
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

# Reload the two fitted artefacts from disk.
tfidf = load('models/tfidf_vectorizer.pkl')
logreg = load('models/LogReg.joblib')

# Chain vectorizer and classifier so that a single graph goes from input
# strings to predictions.
model = Pipeline(steps=[
    ('tfidf', tfidf),
    ('clf', logreg),
])

# Declare one string input named 'input' with shape [None, 1], convert the
# pipeline and write the resulting ONNX model to disk.
onnx_model = convert_sklearn(
    model,
    name='onnx_log_reg',
    initial_types=[('input', StringTensorType([None, 1]))],
)
onnx.save(onnx_model, 'log_reg.onnx')