In [1]:
import pandas as pd
from IPython.core.display import display, HTML
import numpy as np
import nltk
import ssl
import glob, string
from nltk.stem import PorterStemmer, WordNetLemmatizer
from pymorphy2 import MorphAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from autocorrect import Speller
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, make_scorer, f1_score, classification_report,precision_recall_fscore_support

from gensim.models.word2vec import Word2Vec
from collections import Counter
from tqdm.notebook import tqdm
import warnings
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
display(HTML('<style>.container { width:95% !important; }</style>'))
tqdm.pandas()

### Read Data

In [2]:
PATH = 'data/*'
def read_datasets(path):
    files_list = glob.glob(path)
    df_pos, df_neg, df_neu = (pd.DataFrame() for i in range(3))
    for file in files_list:
        if 'pos' in file.lower():
            df_pos = pd.read_csv(file, header=None).T
            df_pos['sentiment'] = 'positive'
        if 'neg' in file.lower():
            df_neg = pd.read_csv(file, header=None).T
            df_neg['sentiment'] = 'negative'
        if 'neu' in file.lower():
            df_neu = pd.read_csv(file, header=None).T
            df_neu['sentiment'] = 'neutral'
    return pd.concat([df.rename(columns = {0:'text'}) for df in (df_pos, df_neg, df_neu)], ignore_index=True)

    
df = read_datasets(PATH)

### Prepare Data

In [3]:
df.isna().sum()

text         5
sentiment    0
dtype: int64

In [4]:
df.dropna(subset=['text'], inplace=True)
df.reset_index(drop=True)
df.isna().sum()

text         0
sentiment    0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3868 entries, 0 to 3872
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       3868 non-null   object
 1   sentiment  3868 non-null   object
dtypes: object(2)
memory usage: 90.7+ KB


In [6]:
df.text = df.text.astype(str)
df.text = df.text.str.lower()
df.sentiment = df.sentiment.astype('category')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3868 entries, 0 to 3872
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   text       3868 non-null   object  
 1   sentiment  3868 non-null   category
dtypes: category(1), object(1)
memory usage: 64.3+ KB


In [8]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/local/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# tokenization
df['tokens'] = df.text.progress_apply(nltk.word_tokenize)
# stemming
stemmer = MorphAnalyzer()
df['stems'] = df.tokens.progress_apply(lambda tokens: [stemmer.parse(token)[0].normal_form for token in tokens])
# lemmatization
lemmatizer = WordNetLemmatizer()
df['lemmas'] = df.tokens.progress_apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
# misspeling
s = Speller()
df['tokens_misspellings'] = df.tokens.progress_apply(lambda tokens: [s.autocorrect_word(token) for token in tokens])
df['lemmas_misspellings'] = df.lemmas.progress_apply(lambda lemmas: [s.autocorrect_word(lemma) for lemma in lemmas])

  0%|          | 0/3868 [00:00<?, ?it/s]

  0%|          | 0/3868 [00:00<?, ?it/s]

  0%|          | 0/3868 [00:00<?, ?it/s]

  0%|          | 0/3868 [00:00<?, ?it/s]

  0%|          | 0/3868 [00:00<?, ?it/s]

In [10]:
punct = string.punctuation 

In [11]:
words_list = [word for sentense in df.tokens for word in sentense if word not in punct]
dictinary = Counter(words_list)

In [12]:
stop_words_custom = sorted(list(set(words_list)))[325:]

In [13]:
stop_words = set(stopwords.words('english') + stop_words_custom)

In [14]:
df['tokens_cleaned'] = df.tokens.apply(lambda tokens: [token for token in tokens \
                                       if token.isnumeric() != True \
                                    and token in stop_words])

In [15]:
df['tokens_cleaned'] = df['tokens_cleaned'].apply(lambda x: np.nan if x == [] else x)

In [16]:
df = df.dropna(subset=['tokens_cleaned'])

In [17]:
# for cleaned stemming
stemmer = MorphAnalyzer()
df['stems_cleaned'] = df.tokens_cleaned.apply(lambda tokens:
                                              [stemmer.parse(token)[0].normal_form for token in tokens])
# for cleaned lemmatization
lemmatizer = WordNetLemmatizer()
df['lemmas_cleaned'] = df.tokens_cleaned.apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
df.head()

Unnamed: 0,text,sentiment,tokens,stems,lemmas,tokens_misspellings,lemmas_misspellings,tokens_cleaned,stems_cleaned,lemmas_cleaned
0,an inspiration in all aspects: fashion,positive,"[an, inspiration, in, all, aspects, :, fashion]","[an, inspiration, in, all, aspects, :, fashion]","[an, inspiration, in, all, aspect, :, fashion]","[an, inspiration, in, all, aspects, a, fashion]","[an, inspiration, in, all, aspect, a, fashion]","[an, inspiration, in, all, aspects, fashion]","[an, inspiration, in, all, aspects, fashion]","[an, inspiration, in, all, aspect, fashion]"
1,fitness,positive,[fitness],[fitness],[fitness],[fitness],[fitness],[fitness],[fitness],[fitness]
2,beauty and personality. :)kisses thefashionicon,positive,"[beauty, and, personality, ., :, ), kisses, thefashionicon]","[beauty, and, personality, ., :, ), kisses, thefashionicon]","[beauty, and, personality, ., :, ), kiss, thefashionicon]","[beauty, and, personality, a, a, a, kisses, thefashionicon]","[beauty, and, personality, a, a, a, kiss, thefashionicon]","[beauty, and, personality, kisses, thefashionicon]","[beauty, and, personality, kisses, thefashionicon]","[beauty, and, personality, kiss, thefashionicon]"
3,apka apna awam ka channel frankline tv aam admi production please visit or likes share :)fb page :...,positive,"[apka, apna, awam, ka, channel, frankline, tv, aam, admi, production, please, visit, or, likes, share, :, ), fb, page, :, ...]","[apka, apna, awam, ka, channel, frankline, tv, aam, admi, production, please, visit, or, likes, share, :, ), fb, page, :, ...]","[apka, apna, awam, ka, channel, frankline, tv, aam, admi, production, please, visit, or, like, share, :, ), fb, page, :, ...]","[aka, ana, away, ka, channel, franklin, tv, aam, admin, production, please, visit, or, likes, share, a, a, fb, page, a, ...]","[aka, ana, away, ka, channel, franklin, tv, aam, admin, production, please, visit, or, like, share, a, a, fb, page, a, ...]","[apka, apna, awam, ka, channel, frankline, tv, aam, admi, production, please, visit, or, likes, share, fb, page]","[apka, apna, awam, ka, channel, frankline, tv, aam, admi, production, please, visit, or, likes, share, fb, page]","[apka, apna, awam, ka, channel, frankline, tv, aam, admi, production, please, visit, or, like, share, fb, page]"
4,beautiful album from the greatest unsung guitar genius of our time - and i've met the great backstage,positive,"[beautiful, album, from, the, greatest, unsung, guitar, genius, of, our, time, -, and, i, 've, met, the, great, backstage]","[beautiful, album, from, the, greatest, unsung, guitar, genius, of, our, time, -, and, i, 've, met, the, great, backstage]","[beautiful, album, from, the, greatest, unsung, guitar, genius, of, our, time, -, and, i, 've, met, the, great, backstage]","[beautiful, album, from, the, greatest, unsung, guitar, genius, of, our, time, a, and, i, ve, met, the, great, backstage]","[beautiful, album, from, the, greatest, unsung, guitar, genius, of, our, time, a, and, i, ve, met, the, great, backstage]","[beautiful, album, from, the, greatest, unsung, guitar, genius, of, our, time, and, i, met, the, great, backstage]","[beautiful, album, from, the, greatest, unsung, guitar, genius, of, our, time, and, i, met, the, great, backstage]","[beautiful, album, from, the, greatest, unsung, guitar, genius, of, our, time, and, i, met, the, great, backstage]"


### Modeling

In [18]:
df_ = df.copy()

In [19]:
X = df_.drop(columns=['sentiment'])
y = df_.sentiment

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=21, test_size=0.2)

In [20]:
y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)

(neutral     0.404854
 positive    0.306149
 negative    0.288997
 Name: sentiment, dtype: float64,
 neutral     0.404916
 positive    0.306598
 negative    0.288486
 Name: sentiment, dtype: float64)

### defaul pipeline: logreg@accuracy, gridsearch@5 folds

In [21]:
custom_scorer = make_scorer(precision_score, greater_is_better=True,  pos_label=0)

In [22]:
labels = list(df_.sentiment.unique())
def calc_scores(method, y_predict, y_test, lbls, params, acc_score):
    params_list = [method]
    columns_ = ['method']
    for average in ('micro', 'macro', 'weighted'):
        scores = precision_recall_fscore_support(y_predict,
                                        y_test,
                                        labels=lbls,
                                        average=average,
                                        beta=0.5)
        columns_.extend([ f'precision_{average}',
                           f'recall_{average}',
                           f'f05_{average}'])
        scores_ = scores[0], scores[1], scores[2], _
        params_list.extend(list(scores_)[:-1])
    params_list.append(params)
    params_list.append(acc_score)
    columns_.append('params')
    columns_.append('acc_score')
    return form_df(params_list, columns_)

def form_df(data, names):
    cols = pd.DataFrame(data).T.columns
    df = pd.DataFrame(data).T.rename(
    columns={k:v for k,v in zip(cols,names)})
    return df

In [23]:
def get_model(method, pipeline, parameters):
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    grid_search.fit(X_train[method], y_train)
    y_predict = grid_search.predict(X_test[method])
    acc_score = grid_search.score(X_test[method], y_test)
    scores = calc_scores(method, y_predict, y_test, labels, grid_search.best_params_, acc_score)
    return scores

In [24]:
def dummy(doc):
    return doc

pipeline = Pipeline([
    ('vect', CountVectorizer(
                tokenizer=dummy,
                preprocessor=dummy,
                binary=True,
                max_df=1.0,
                )  ),
    ('logreg', LogisticRegression(max_iter=1500, random_state=21)),
])

parameters = {
    'vect__min_df': (1, 2, 4, 6, 10, 15),
    'vect__max_features': (range(1500, 5500, 1000)),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
}

In [25]:
result = []

In [26]:
%%time
methods = ['text', 'tokens', 'stems', \
           'lemmas', 'tokens_misspellings',  'lemmas_misspellings', \
           'tokens_cleaned', 'stems_cleaned', 'lemmas_cleaned']    
# p = Process(target=get_model, args=(method,))
# with Pool(9) as p:
#     r = list(tqdm(p.imap(get_model, methods), total=len(methods)))
r = [get_model(method, pipeline, parameters) for method in tqdm(methods)]
df = pd.concat(r).set_index('method')
result.append(df)

  0%|          | 0/9 [00:00<?, ?it/s]

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
CPU times: user 50.3 s, sys: 4.71 s, total: 55 s
Wall time: 1min 57s


In [27]:
df

Unnamed: 0_level_0,precision_micro,recall_micro,f05_micro,precision_macro,recall_macro,f05_macro,precision_weighted,recall_weighted,f05_weighted,params,acc_score
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
text,0.913325,0.913325,0.913325,0.907062,0.915911,0.908204,0.916451,0.913325,0.915166,"{'vect__max_features': 3500, 'vect__min_df': 6, 'vect__ngram_range': (1, 3)}",0.913325
tokens,0.917206,0.917206,0.917206,0.909486,0.924034,0.911203,0.922451,0.917206,0.920112,"{'vect__max_features': 4500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.917206
stems,0.917206,0.917206,0.917206,0.909486,0.924034,0.911203,0.922451,0.917206,0.920112,"{'vect__max_features': 4500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.917206
lemmas,0.912031,0.912031,0.912031,0.904973,0.916584,0.906409,0.916361,0.912031,0.914541,"{'vect__max_features': 2500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.912031
tokens_misspellings,0.906856,0.906856,0.906856,0.899512,0.91211,0.901034,0.911566,0.906856,0.909549,"{'vect__max_features': 3500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.906856
lemmas_misspellings,0.902975,0.902975,0.902975,0.895546,0.906918,0.896956,0.907399,0.902975,0.905581,"{'vect__max_features': 2500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.902975
tokens_cleaned,0.906856,0.906856,0.906856,0.8996,0.911512,0.901015,0.911572,0.906856,0.909595,"{'vect__max_features': 3500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.906856
stems_cleaned,0.906856,0.906856,0.906856,0.8996,0.911512,0.901015,0.911572,0.906856,0.909595,"{'vect__max_features': 3500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.906856
lemmas_cleaned,0.902975,0.902975,0.902975,0.895546,0.906989,0.896981,0.907349,0.902975,0.905549,"{'vect__max_features': 2500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.902975


### word count

In [28]:
def dummy(doc):
    return doc

pipeline = Pipeline([
    ('vect', CountVectorizer(
                tokenizer=dummy,
                preprocessor=dummy,
                binary=False,
                )  ),
    ('logreg', LogisticRegression(max_iter=1500, random_state=21)),
])

parameters = {
    'vect__min_df': (1, 2, 4, 6, 10, 15),
    'vect__max_features': (range(1500, 5500, 1000)),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'vect__binary': (False, )
}

In [29]:
%%time
methods = ['text', 'tokens', 'stems', \
           'lemmas', 'tokens_misspellings',  'lemmas_misspellings', \
           'tokens_cleaned', 'stems_cleaned', 'lemmas_cleaned']    
# p = Process(target=get_model, args=(method,))
# with Pool(9) as p:
#     r = list(tqdm(p.imap(get_model, methods), total=len(methods)))
r = [get_model(method, pipeline, parameters) for method in tqdm(methods)]
df = pd.concat(r).set_index('method')
result.append(df)

  0%|          | 0/9 [00:00<?, ?it/s]

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
CPU times: user 1min 17s, sys: 9.66 s, total: 1min 27s
Wall time: 3min


In [30]:
df

Unnamed: 0_level_0,precision_micro,recall_micro,f05_micro,precision_macro,recall_macro,f05_macro,precision_weighted,recall_weighted,f05_weighted,params,acc_score
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
text,0.899094,0.899094,0.899094,0.893894,0.900012,0.894802,0.901058,0.899094,0.900327,"{'vect__binary': False, 'vect__max_features': 3500, 'vect__min_df': 4, 'vect__ngram_range': (1, 3)}",0.899094
tokens,0.913325,0.913325,0.913325,0.905785,0.920012,0.907319,0.919125,0.913325,0.916587,"{'vect__binary': False, 'vect__max_features': 1500, 'vect__min_df': 6, 'vect__ngram_range': (1, 2)}",0.913325
stems,0.913325,0.913325,0.913325,0.905785,0.920012,0.907319,0.919125,0.913325,0.916587,"{'vect__binary': False, 'vect__max_features': 1500, 'vect__min_df': 6, 'vect__ngram_range': (1, 2)}",0.913325
lemmas,0.914618,0.914618,0.914618,0.907533,0.919673,0.908959,0.919309,0.914618,0.917308,"{'vect__binary': False, 'vect__max_features': 1500, 'vect__min_df': 4, 'vect__ngram_range': (1, 2)}",0.914618
tokens_misspellings,0.90815,0.90815,0.90815,0.900147,0.914889,0.901822,0.913835,0.90815,0.911327,"{'vect__binary': False, 'vect__max_features': 3500, 'vect__min_df': 1, 'vect__ngram_range': (1, 3)}",0.90815
lemmas_misspellings,0.901682,0.901682,0.901682,0.894139,0.90768,0.895733,0.906839,0.901682,0.904605,"{'vect__binary': False, 'vect__max_features': 3500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.901682
tokens_cleaned,0.904269,0.904269,0.904269,0.896269,0.910804,0.89791,0.910067,0.904269,0.907544,"{'vect__binary': False, 'vect__max_features': 4500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.904269
stems_cleaned,0.904269,0.904269,0.904269,0.896269,0.910804,0.89791,0.910067,0.904269,0.907544,"{'vect__binary': False, 'vect__max_features': 4500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.904269
lemmas_cleaned,0.901682,0.901682,0.901682,0.894139,0.906485,0.895617,0.906553,0.901682,0.904509,"{'vect__binary': False, 'vect__max_features': 4500, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}",0.901682


### TFIDF

In [31]:
def dummy(doc):
    return doc

pipeline = Pipeline([
    ('vect', CountVectorizer(
                tokenizer=dummy,
                preprocessor=dummy,
                )  ),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression(max_iter=1500, random_state=21)),
])

parameters = {
    'vect__min_df': (1, 2, 4, 6, 10, 15),
#     'vect__max_features': (range(1500, 5500, 1000)),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'vect__binary': (True, False),
    'tfidf__norm': ('l1', 'l2'),
}

In [32]:
%%time
methods = ['text', 'tokens', 'stems', \
           'lemmas', 'tokens_misspellings',  'lemmas_misspellings', \
           'tokens_cleaned', 'stems_cleaned', 'lemmas_cleaned']    
# p = Process(target=get_model, args=(method,))
# with Pool(9) as p:
#     r = list(tqdm(p.imap(get_model, methods), total=len(methods)))
r = [get_model(method, pipeline, parameters) for method in tqdm(methods)]
df = pd.concat(r).set_index('method')
result.append(df)

  0%|          | 0/9 [00:00<?, ?it/s]

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
CPU times: user 1min 12s, sys: 5.1 s, total: 1min 17s
Wall time: 2min 41s


In [33]:
df

Unnamed: 0_level_0,precision_micro,recall_micro,f05_micro,precision_macro,recall_macro,f05_macro,precision_weighted,recall_weighted,f05_weighted,params,acc_score
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
text,0.904269,0.904269,0.904269,0.89541,0.915062,0.897522,0.911652,0.904269,0.908196,"{'tfidf__norm': 'l2', 'vect__binary': True, 'vect__min_df': 15, 'vect__ngram_range': (1, 3)}",0.904269
tokens,0.917206,0.917206,0.917206,0.909498,0.921075,0.910541,0.923096,0.917206,0.920668,"{'tfidf__norm': 'l2', 'vect__binary': True, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}",0.917206
stems,0.917206,0.917206,0.917206,0.909498,0.921075,0.910541,0.923096,0.917206,0.920668,"{'tfidf__norm': 'l2', 'vect__binary': True, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}",0.917206
lemmas,0.915912,0.915912,0.915912,0.908433,0.920433,0.90959,0.921689,0.915912,0.919288,"{'tfidf__norm': 'l2', 'vect__binary': True, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}",0.915912
tokens_misspellings,0.906856,0.906856,0.906856,0.8996,0.911512,0.901015,0.911572,0.906856,0.909595,"{'tfidf__norm': 'l2', 'vect__binary': True, 'vect__min_df': 6, 'vect__ngram_range': (1, 1)}",0.906856
lemmas_misspellings,0.910737,0.910737,0.910737,0.903566,0.91563,0.904952,0.915534,0.910737,0.913493,"{'tfidf__norm': 'l2', 'vect__binary': True, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}",0.910737
tokens_cleaned,0.905563,0.905563,0.905563,0.898194,0.911168,0.899725,0.910614,0.905563,0.908458,"{'tfidf__norm': 'l2', 'vect__binary': True, 'vect__min_df': 6, 'vect__ngram_range': (1, 1)}",0.905563
stems_cleaned,0.905563,0.905563,0.905563,0.898194,0.911168,0.899725,0.910614,0.905563,0.908458,"{'tfidf__norm': 'l2', 'vect__binary': True, 'vect__min_df': 6, 'vect__ngram_range': (1, 1)}",0.905563
lemmas_cleaned,0.909444,0.909444,0.909444,0.901642,0.914043,0.903121,0.914418,0.909444,0.912344,"{'tfidf__norm': 'l2', 'vect__binary': True, 'vect__min_df': 4, 'vect__ngram_range': (1, 3)}",0.909444


### TFIDF + SGD

In [34]:
def dummy(doc):
    return doc

pipeline = Pipeline([
    ('vect', CountVectorizer(
                tokenizer=dummy,
                preprocessor=dummy,
                )  ),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(max_iter=1000)),
])

parameters = {
#     'vect__min_df': (1, 2, 4, 6),
#     'vect__max_features': (range(1500, 5500, 1000)),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'vect__binary': (True, False),
    'tfidf__norm': ('l2',),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (100, 150, 250, 500),
}

In [35]:
%%time
methods = ['text', 'tokens', 'stems', \
           'lemmas', 'tokens_misspellings',  'lemmas_misspellings', \
           'tokens_cleaned', 'stems_cleaned', 'lemmas_cleaned']    
# p = Process(target=get_model, args=(method,))
# with Pool(9) as p:
#     r = list(tqdm(p.imap(get_model, methods), total=len(methods)))
r = [get_model(method, pipeline, parameters) for method in tqdm(methods)]
df = pd.concat(r).set_index('method')
result.append(df)

  0%|          | 0/9 [00:00<?, ?it/s]

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
CPU times: user 47.6 s, sys: 1.7 s, total: 49.3 s
Wall time: 1min 55s


In [36]:
df

Unnamed: 0_level_0,precision_micro,recall_micro,f05_micro,precision_macro,recall_macro,f05_macro,precision_weighted,recall_weighted,f05_weighted,params,acc_score
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
text,0.892626,0.892626,0.892626,0.886785,0.891774,0.887094,0.896504,0.892626,0.895092,"{'clf__alpha': 1e-05, 'clf__max_iter': 100, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'vect__binary': True, 'vect__ngram_range': (1, 3)}",0.892626
tokens,0.918499,0.918499,0.918499,0.912447,0.919058,0.912923,0.922132,0.918499,0.920633,"{'clf__alpha': 1e-05, 'clf__max_iter': 150, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'vect__binary': True, 'vect__ngram_range': (1, 2)}",0.918499
stems,0.923674,0.923674,0.923674,0.918502,0.92382,0.919102,0.926084,0.923674,0.925155,"{'clf__alpha': 1e-05, 'clf__max_iter': 250, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'vect__binary': True, 'vect__ngram_range': (1, 3)}",0.923674
lemmas,0.921087,0.921087,0.921087,0.916031,0.921105,0.916719,0.923232,0.921087,0.922468,"{'clf__alpha': 1e-05, 'clf__max_iter': 150, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'vect__binary': True, 'vect__ngram_range': (1, 2)}",0.921087
tokens_misspellings,0.914618,0.914618,0.914618,0.90924,0.915599,0.910199,0.916521,0.914618,0.915803,"{'clf__alpha': 1e-05, 'clf__max_iter': 100, 'clf__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'vect__binary': True, 'vect__ngram_range': (1, 2)}",0.914618
lemmas_misspellings,0.909444,0.909444,0.909444,0.905069,0.90897,0.905534,0.910934,0.909444,0.910337,"{'clf__alpha': 1e-05, 'clf__max_iter': 100, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'vect__binary': True, 'vect__ngram_range': (1, 3)}",0.909444
tokens_cleaned,0.921087,0.921087,0.921087,0.916626,0.921351,0.917203,0.922845,0.921087,0.922139,"{'clf__alpha': 1e-05, 'clf__max_iter': 250, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'vect__binary': True, 'vect__ngram_range': (1, 3)}",0.921087
stems_cleaned,0.904269,0.904269,0.904269,0.899696,0.902254,0.899972,0.905568,0.904269,0.905089,"{'clf__alpha': 1e-05, 'clf__max_iter': 150, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'vect__binary': True, 'vect__ngram_range': (1, 3)}",0.904269
lemmas_cleaned,0.901682,0.901682,0.901682,0.898084,0.900114,0.898166,0.903063,0.901682,0.902496,"{'clf__alpha': 1e-05, 'clf__max_iter': 100, 'clf__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'vect__binary': True, 'vect__ngram_range': (1, 2)}",0.901682


### TFIDF + DecisionTree

In [37]:
def dummy(doc):
    return doc

pipeline = Pipeline([
    ('vect', CountVectorizer(
                tokenizer=dummy,
                preprocessor=dummy,
                )  ),
    ('tfidf', TfidfTransformer()),
    ('tree', DecisionTreeClassifier()),
])

parameters = {
#     'vect__min_df': (1, 2, 3),
#     'vect__max_features': (range(1500, 5500, 1000)),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__binary': (True, False),
    'tfidf__norm': ('l2',),
    'tree__criterion': ("gini", "entropy"),
    'tree__max_depth': (None, 5, 7, 11, 15),
    'tree__max_features': (100, 150, 250, 500),
}

In [38]:
%%time
methods = ['text', 'tokens', 'stems', \
           'lemmas', 'tokens_misspellings',  'lemmas_misspellings', \
           'tokens_cleaned', 'stems_cleaned', 'lemmas_cleaned']    
# p = Process(target=get_model, args=(method,))
# with Pool(9) as p:
#     r = list(tqdm(p.imap(get_model, methods), total=len(methods)))
r = [get_model(method, pipeline, parameters) for method in tqdm(methods)]
df = pd.concat(r).set_index('method')
result.append(df)

  0%|          | 0/9 [00:00<?, ?it/s]

Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
CPU times: user 1min 8s, sys: 1.53 s, total: 1min 10s
Wall time: 2min 10s


In [39]:
df

Unnamed: 0_level_0,precision_micro,recall_micro,f05_micro,precision_macro,recall_macro,f05_macro,precision_weighted,recall_weighted,f05_weighted,params,acc_score
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
text,0.851229,0.851229,0.851229,0.844639,0.861938,0.846193,0.857433,0.851229,0.854327,"{'tfidf__norm': 'l2', 'tree__criterion': 'gini', 'tree__max_depth': 11, 'tree__max_features': 500, 'vect__binary': True, 'vect__ngram_range': (1, 2)}",0.851229
tokens,0.878396,0.878396,0.878396,0.873174,0.879436,0.87387,0.881194,0.878396,0.880092,"{'tfidf__norm': 'l2', 'tree__criterion': 'entropy', 'tree__max_depth': None, 'tree__max_features': 500, 'vect__binary': False, 'vect__ngram_range': (1, 1)}",0.878396
stems,0.886158,0.886158,0.886158,0.879905,0.88916,0.881032,0.88985,0.886158,0.888354,"{'tfidf__norm': 'l2', 'tree__criterion': 'gini', 'tree__max_depth': None, 'tree__max_features': 500, 'vect__binary': False, 'vect__ngram_range': (1, 1)}",0.886158
lemmas,0.839586,0.839586,0.839586,0.836256,0.836328,0.836241,0.840028,0.839586,0.839913,"{'tfidf__norm': 'l2', 'tree__criterion': 'gini', 'tree__max_depth': None, 'tree__max_features': 500, 'vect__binary': False, 'vect__ngram_range': (1, 1)}",0.839586
tokens_misspellings,0.822768,0.822768,0.822768,0.804959,0.834452,0.804377,0.854472,0.822768,0.842086,"{'tfidf__norm': 'l2', 'tree__criterion': 'gini', 'tree__max_depth': 15, 'tree__max_features': 500, 'vect__binary': True, 'vect__ngram_range': (1, 1)}",0.822768
lemmas_misspellings,0.856404,0.856404,0.856404,0.852579,0.855847,0.853117,0.857354,0.856404,0.85704,"{'tfidf__norm': 'l2', 'tree__criterion': 'entropy', 'tree__max_depth': None, 'tree__max_features': 500, 'vect__binary': True, 'vect__ngram_range': (1, 1)}",0.856404
tokens_cleaned,0.874515,0.874515,0.874515,0.867489,0.874986,0.868479,0.877796,0.874515,0.876594,"{'tfidf__norm': 'l2', 'tree__criterion': 'gini', 'tree__max_depth': None, 'tree__max_features': 500, 'vect__binary': False, 'vect__ngram_range': (1, 1)}",0.874515
stems_cleaned,0.86934,0.86934,0.86934,0.864771,0.866549,0.865073,0.870086,0.86934,0.86988,"{'tfidf__norm': 'l2', 'tree__criterion': 'gini', 'tree__max_depth': None, 'tree__max_features': 500, 'vect__binary': True, 'vect__ngram_range': (1, 1)}",0.86934
lemmas_cleaned,0.849935,0.849935,0.849935,0.843145,0.848968,0.843827,0.853125,0.849935,0.851997,"{'tfidf__norm': 'l2', 'tree__criterion': 'gini', 'tree__max_depth': None, 'tree__max_features': 500, 'vect__binary': False, 'vect__ngram_range': (1, 1)}",0.849935


### word2vec

In [40]:
model = Word2Vec(sentences=df_.tokens.tolist(), window=16, min_count=1, size=150, iter=100)

In [41]:
vectors_train = []
for index, row in X_train.iterrows():
#     print(row)
    model_vector = (np.mean([model.wv[token] for token in row['tokens']], axis=0)).tolist()
#     if isinstance(model_vector, float):
#         print(row)
    vectors_train.append(model_vector)

In [42]:
vectors_test = []
for index, row in X_test.iterrows():
    model_vector = (np.mean([model.wv[token] for token in row['tokens']], axis=0)).tolist()
    vectors_test.append(model_vector)

In [43]:
logres = LogisticRegression(n_jobs=-1, max_iter=1500)

In [44]:
logres.fit(vectors_train, y_train)

LogisticRegression(max_iter=1500, n_jobs=-1)

In [45]:
accuracy_score(y_test, logres.predict(vectors_test))

0.9120310478654593

### FASTTEXT

In [21]:
import fasttext

In [22]:
method = 'text'
X = df_[method]
y = df_.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=21)
(X_train + ' __label__' + y_train.astype(str)).to_csv('train', index=False)
(X_test + ' __label__' + y_test.astype(str)).to_csv('test', index=False)
model = fasttext.train_supervised(input="train", autotuneValidationFile='test', verbose=10000, autotuneDuration=200)
acc_score = model.test("test")[1]

Trial = 1
epoch = 5
lr = 0.1
dim = 100
minCount = 1
wordNgrams = 1
minn = 0
maxn = 0
bucket = 0
dsub = 2
loss = softmax
currentScore = 0.893375
train took = 0.163794
Trial = 2
epoch = 1
lr = 0.835603
dim = 382
minCount = 1
wordNgrams = 1
minn = 0
maxn = 0
bucket = 0
dsub = 2
loss = softmax
currentScore = 0.887164
train took = 0.199815
Trial = 3
epoch = 3
lr = 0.216041
dim = 168
minCount = 1
wordNgrams = 4
minn = 0
maxn = 0
bucket = 2620593
dsub = 2
loss = softmax


Progress:   0.8% Trials:    3 Best score:  0.893375 ETA:   0h 3m18s

currentScore = 0.881988
train took = 1.39533
Trial = 4
epoch = 2
lr = 0.984649
dim = 23
minCount = 1
wordNgrams = 1
minn = 0
maxn = 0
bucket = 0
dsub = 2
loss = softmax


Progress:   1.0% Trials:    4 Best score:  0.893375 ETA:   0h 3m17s

currentScore = 0.89234
train took = 0.379419
Trial = 5
epoch = 56
lr = 0.0399362
dim = 196
minCount = 1
wordNgrams = 1
minn = 0
maxn = 0
bucket = 0
dsub = 2
loss = softmax
currentScore = 0.899586
train took = 0.250831
Trial = 6
epoch = 60
lr = 0.0212034
dim = 152
minCount = 1
wordNgrams = 5
minn = 0
maxn = 0
bucket = 287917
dsub = 2
loss = softmax


Progress:   1.3% Trials:    6 Best score:  0.899586 ETA:   0h 3m17s

currentScore = 0.895445
train took = 0.592363
Trial = 7
epoch = 100
lr = 0.01
dim = 40
minCount = 1
wordNgrams = 1
minn = 0
maxn = 0
bucket = 0
dsub = 2
loss = softmax


Progress:   1.5% Trials:    7 Best score:  0.899586 ETA:   0h 3m16s

currentScore = 0.900621
train took = 0.270432
Trial = 8
epoch = 9
lr = 0.0185386
dim = 33
minCount = 1
wordNgrams = 1
minn = 3
maxn = 6
bucket = 377970
dsub = 2
loss = softmax
currentScore = 0.796066
train took = 0.13969
Trial = 9
epoch = 24
lr = 0.01
dim = 12
minCount = 1
wordNgrams = 3
minn = 0
maxn = 0
bucket = 372933
dsub = 4
loss = softmax
currentScore = 0.812629
train took = 0.108451
Trial = 10
epoch = 100
lr = 0.01
dim = 27
minCount = 1
wordNgrams = 3
minn = 0
maxn = 0
bucket = 10000000
dsub = 2
loss = softmax


Progress:   2.0% Trials:   10 Best score:  0.900621 ETA:   0h 3m15s

currentScore = 0.899586
train took = 0.986851
Trial = 11
epoch = 33
lr = 0.01
dim = 63
minCount = 1
wordNgrams = 2
minn = 0
maxn = 0
bucket = 4521359
dsub = 2
loss = softmax


Progress:   2.5% Trials:   11 Best score:  0.900621 ETA:   0h 3m14s

currentScore = 0.874741
train took = 1.04783
Trial = 12
epoch = 100
lr = 0.059155
dim = 121
minCount = 1
wordNgrams = 1
minn = 3
maxn = 6
bucket = 3417103
dsub = 8
loss = softmax


Progress:   3.8% Trials:   12 Best score:  0.900621 ETA:   0h 3m12s

currentScore = 0.901656
train took = 2.40503
Trial = 13
epoch = 31
lr = 0.0486025
dim = 73
minCount = 1
wordNgrams = 1
minn = 0
maxn = 0
bucket = 0
dsub = 4
loss = softmax


Progress:   4.0% Trials:   13 Best score:  0.901656 ETA:   0h 3m11s

currentScore = 0.900621
train took = 0.32014
Trial = 14
epoch = 21
lr = 0.0866757
dim = 40
minCount = 1
wordNgrams = 1
minn = 3
maxn = 6
bucket = 561631
dsub = 2
loss = softmax
currentScore = 0.900621
train took = 0.189021
Trial = 15
epoch = 34
lr = 0.569388
dim = 293
minCount = 1
wordNgrams = 2
minn = 3
maxn = 6
bucket = 4367960
dsub = 2
loss = softmax


Progress:   6.3% Trials:   15 Best score:  0.901656 ETA:   0h 3m 7s

currentScore = 0.903727
train took = 4.24135
Trial = 16
epoch = 6
lr = 0.878024
dim = 105
minCount = 1
wordNgrams = 1
minn = 3
maxn = 6
bucket = 2671003
dsub = 2
loss = softmax


Progress:   7.0% Trials:   16 Best score:  0.903727 ETA:   0h 3m 5s

currentScore = 0.901656
train took = 1.39947
Trial = 17
epoch = 67
lr = 1.2809
dim = 163
minCount = 1
wordNgrams = 1
minn = 3
maxn = 6
bucket = 674349
dsub = 2
loss = softmax


Progress:   7.5% Trials:   17 Best score:  0.903727 ETA:   0h 3m 4s

currentScore = 0.902692
train took = 1.34392
Trial = 18
epoch = 100
lr = 5
dim = 169
minCount = 1
wordNgrams = 2
minn = 3
maxn = 6
bucket = 601846
dsub = 2
loss = softmax


Progress:   8.5% Trials:   18 Best score:  0.903727 ETA:   0h 3m 2s

currentScore = 0.881988
train took = 2.03619
Trial = 19
epoch = 85
lr = 0.503359
dim = 147
minCount = 1
wordNgrams = 5
minn = 0
maxn = 0
bucket = 1750121
dsub = 4
loss = softmax


Progress:   9.3% Trials:   19 Best score:  0.903727 ETA:   0h 3m 1s

currentScore = 0.902692
train took = 1.23992
Trial = 20
epoch = 100
lr = 1.31014
dim = 277
minCount = 1
wordNgrams = 1
minn = 0
maxn = 0
bucket = 0
dsub = 4
loss = softmax


Progress:   9.6% Trials:   20 Best score:  0.903727 ETA:   0h 3m 0s

currentScore = 0.89234
train took = 0.462599
Trial = 21
epoch = 12
lr = 0.367235
dim = 839
minCount = 1
wordNgrams = 4
minn = 0
maxn = 0
bucket = 3123296
dsub = 2
loss = softmax


Progress:  14.3% Trials:   21 Best score:  0.903727 ETA:   0h 2m51s

currentScore = 0.899586
train took = 9.93591
Trial = 22
epoch = 100
lr = 0.214882
dim = 367
minCount = 1
wordNgrams = 1
minn = 0
maxn = 0
bucket = 0
dsub = 2
loss = softmax


Progress:  15.3% Trials:   22 Best score:  0.903727 ETA:   0h 2m49s

currentScore = 0.897516
train took = 1.86571
Trial = 23
epoch = 55
lr = 0.560999
dim = 555
minCount = 1
wordNgrams = 3
minn = 2
maxn = 5
bucket = 6327541
dsub = 2
loss = softmax


Progress:  28.7% Trials:   23 Best score:  0.903727 ETA:   0h 2m22s

currentScore = 0.909938
train took = 26.3501
Trial = 24
epoch = 100
lr = 0.168795
dim = 639
minCount = 1
wordNgrams = 1
minn = 3
maxn = 6
bucket = 10000000
dsub = 2
loss = softmax


Progress:  64.0% Trials:   24 Best score:  0.909938 ETA:   0h 1m11s

currentScore = 0.902692
train took = 71.1544
Trial = 25
epoch = 42
lr = 0.182415
dim = 724
minCount = 1
wordNgrams = 5
minn = 3
maxn = 6
bucket = 7308332
dsub = 2
loss = softmax


Progress: 100.0% Trials:   25 Best score:  0.909938 ETA:   0h 0m 0s
Training again with best arguments


Best selected args = 0
epoch = 55
lr = 0.560999
dim = 555
minCount = 1
wordNgrams = 3
minn = 2
maxn = 5
bucket = 6327541
dsub = 2
loss = softmax


Read 0M words
Number of words:  6394
Number of labels: 3
Progress: 100.0% words/sec/thread:    7649 lr:  0.000000 avg.loss:  0.029301 ETA:   0h 0m 0s 25.4% words/sec/thread:    2269 lr:  0.418471 avg.loss:  0.106753 ETA:   0h 0m51s


In [23]:
acc_score

0.9037267080745341

### Similarity

In [27]:
result = []
for i, words1 in enumerate(tqdm(df_[method].values)):
    words1 = set(words1)
    for j, words2 in enumerate(df_[method].values):
        if j <= i:
            continue
        words2 = set(words2)
        inter = len(words1 & words2)
        outer = len(words1 | words2)
        result.append([inter/outer, i, j])
popular = list(filter(lambda x: x[0] < 1, sorted(result, key=lambda x: x[0], reverse=True)))

  0%|          | 0/3863 [00:00<?, ?it/s]

In [28]:
popular[:10]

[[0.9629629629629629, 2597, 3049],
 [0.9615384615384616, 3, 3317],
 [0.9615384615384616, 54, 174],
 [0.9615384615384616, 96, 3521],
 [0.9615384615384616, 115, 3521],
 [0.9615384615384616, 174, 972],
 [0.9615384615384616, 828, 874],
 [0.9615384615384616, 828, 1688],
 [0.9615384615384616, 828, 1832],
 [0.9615384615384616, 944, 1688]]

In [29]:
df = set()
for _, i, j in popular:
    df.add(tuple(df_['text'][[i, j]].tolist()))
    if len(df) >= 10:
        break
        
df = pd.DataFrame(df, columns=['tweet1', 'tweet2'])
df

Unnamed: 0,tweet1,tweet2
0,apka apna awam ka channel frankline tv aam admi production please visit or likes share :)fb page :...,controlling and more. also in epaper.
1,how sword-wielding processions left gaping.,gurgaon and bengal.
2,we will be never ready actually happy idhuellam dialogue ku dhan othu varum..,just please look at how scared and full of tence lauren's face is after tryin reaching camila's hand
3,never give up on the things that make you smile happy,left few months in hinhua ler unhappy i need to gambateh wink
4,never give up on the things that make you smile happy,just please look at how scared and full of tence lauren's face is after tryin reaching camila's hand
5,we will be never ready actually happy idhuellam dialogue ku dhan othu varum..,that would be a great trick happy
6,we will be never ready actually happy idhuellam dialogue ku dhan othu varum..,left few months in hinhua ler unhappy i need to gambateh wink
7,ministers of pmln are happy like those students who never studied for exams luckily exams got delayed. they'll still ha,delhi airport drops after glitches
8,wow i just made a js compressor that actually checks scoped variables existing. turns out i have a few issues happy linters may be useful happy,i'll like her happy
9,jin was spotted at everland today. nice to see them having a break after the us tour happy,wow i just made a js compressor that actually checks scoped variables existing. turns out i have a few issues happy linters may be useful happy
