### Entrenamiento de modelo con Sklearn

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

### Obtención de dataset

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv')
df

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0
...,...,...
29995,@Calumfan1 is it in any way related to photosh...,0
29996,@Swiz_NZ really? wow thats crap,0
29997,"At the 2010 lexus HS250h press event. Again, ...",0
29998,@karmicunderpath ooooh now there's a nice thou...,1


### Preprocesameinto de datos

In [13]:
def text_prep(text:str):
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    
    tokens = [] 
    text = re.sub('https\S+', '', text)
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
        "]+", flags = re.UNICODE)

    text = regrex_pattern.sub(r'',text)
    #text = re.sub('Replying to \n@\S+', '', text)
    text = re.sub('RT @\S+', '', text)
    text = re.sub('@\S+', '', text)
    text = re.sub('#\S+', '', text)

    for w in word_tokenize(text):
        w = w.lower()
        if ((re.search('[a-zA-Z]', w)) and (w not in stopwords)): tokens.append(w)
    return ' '.join(tokens)

In [15]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk import FreqDist
import re

[nltk_data] Downloading package punkt to /home/jh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
df['text_prep'] = df.twitts.apply(text_prep)

In [22]:
df.tail(10)

Unnamed: 0,twitts,sentiment,text_prep
29990,@DavidArchie yeah! probably its because of tho...,1,yeah probably cats hate cats especially start ...
29991,you go girl!!! write like your heart depends o...,1,go girl write like heart depends flippin fanta...
29992,I'm recieving alot of shocking news one shot ....,0,'m recieving alot shocking news one shot http ...
29993,Sometimes I even fascinate myself... As I stum...,1,sometimes even fascinate stumle happily home a...
29994,On my way to jons,1,way jons
29995,@Calumfan1 is it in any way related to photosh...,0,way related photoshop
29996,@Swiz_NZ really? wow thats crap,0,really wow thats crap
29997,"At the 2010 lexus HS250h press event. Again, ...",0,lexus hs250h press event ca n't tell anything ...
29998,@karmicunderpath ooooh now there's a nice thou...,1,ooooh 's nice thought
29999,@mariap91 i'd usually ask you about the sun an...,1,'d usually ask sun school since write words of...


### Entrenamiento con svm

In [17]:
def run_svm(df):
    X = df['text_prep']
    y = df['sentiment']

    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

    print('shape of X: ', X.shape)

    clf = LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print()
    print('Printing Report')
    print(classification_report(y_test, y_pred))
    
    return tfidf, clf

In [18]:
%%time
tfidf, clf = run_svm(df)

shape of X:  (30000, 27184)

Printing Report
              precision    recall  f1-score   support

           0       0.73      0.72      0.73      3000
           1       0.73      0.74      0.73      3000

    accuracy                           0.73      6000
   macro avg       0.73      0.73      0.73      6000
weighted avg       0.73      0.73      0.73      6000

CPU times: user 1.19 s, sys: 24.2 ms, total: 1.21 s
Wall time: 1.28 s


## Test

In [20]:
x = ['i am really happy. thanks a lot for coming with me']

In [21]:
clf.predict(tfidf.transform(x))

array([1])

## Guardamos el modelo

In [23]:
import pickle

pickle.dump(clf, open('clf.pkl', 'wb'))
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))