# Using TFIDF Pipeline and Classical Algorithms

In [1]:
pwd

'/run/media/kuldeepsingh/Work/college_stuff/courses/semester_3/cse_482/project'

In [2]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [3]:
# path = "/content/gdrive/My Drive/identify_the_sentiments/"

In [71]:
import pandas as pd
import numpy as np
import re
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import ElasticNet

import warnings
warnings.filterwarnings("ignore")

In [41]:
import matplotlib.pyplot as plt

In [42]:
# train = pd.read_csv(path + "data/train_data/train_data.csv")
# test = pd.read_csv(path + "data/test_data/test_data.csv")

train = pd.read_csv("data/train_data.csv")
# test = pd.read_csv("data/test_data.csv")


train.shape, #test.shape

((7920, 10),)

In [43]:
train.columns

Index(['id', 'label', 'tweet', 'topics', 'num_topics', 'extracted_emojis',
       'num_emojis', 'length_of_tweet', 'num_of_slurrs', 'emoji_score'],
      dtype='object')

In [44]:
train.head()

Unnamed: 0,id,label,tweet,topics,num_topics,extracted_emojis,num_emojis,length_of_tweet,num_of_slurrs,emoji_score
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy android apps beautiful c...,11.0,noemoji,0.0,13,0.0,0
1,2,0,Finally a transparant silicon case ^^ Thanks t...,yay sony xperia s sonyexperias…,5.0,:),1.0,17,0.0,2
2,3,0,We love this! Would you go? #talk #makememorie...,talk makememories unplug relax iphone smartpho...,8.0,noemoji,0.0,15,0.0,0
3,4,0,I'm wired I know I'm George I was made that wa...,iphone cute daventry home,4.0,;),1.0,17,0.0,0
4,5,1,What amazing service! Apple won't even talk to...,no_topics,0.0,noemoji,0.0,23,0.0,0


In [45]:
train.isnull().sum()

id                  0
label               0
tweet               0
topics              0
num_topics          0
extracted_emojis    0
num_emojis          0
length_of_tweet     0
num_of_slurrs       0
emoji_score         0
dtype: int64

In [46]:
# emoji_dict = {}
# for s in train.emoji:
#     if type(s) == str:
#         emojis = s.split()
#         for e in emojis:
#             if e not in emoji_dict.keys():
#                 emoji_dict[e] = 0
#             else:
#                 emoji_dict[e] += 1

# emoji_dict

In [47]:
def preprocess_tweet(tweet):
    #convert the tweet to lower case
    tweet = tweet.lower()
    
    #convert all urls to sting "URL"
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    
    #convert all @username to "AT_USER"
    tweet = re.sub('@[^\s]+','username', tweet)
    
    # coverting "$&@*#" to slur
    tweet = re.sub('$&@*#','profane', tweet)

    #correct all multiple white spaces to a single white space
    tweet = re.sub('[\s]+', ' ', tweet)
    
    #convert "#topic" to just "topic"
    # tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    tweet = re.sub(r'#([^\s]+)', r' ', tweet)
    
    #correct all multiple white spaces to a single white space
    tweet = re.sub('[\s]+', ' ', tweet)

    return tweet.lower().strip()


def extract_topics(tweet):
    pattern = re.compile(r'#([^\s]+)')
    matches = pattern.findall(tweet)
    topic_string = " ".join(list(set(matches)))
    if len(topic_string) < 1:
        topic_string = "no_topics"
    return topic_string.lower().strip()

def num_topics(tweet):
    pattern = re.compile(r'#([^\s]+)')
    matches = pattern.findall(tweet)
    topic_string = " ".join(matches)
    return float(len(topic_string.split()))


def extract_emojis(tweet):
    pattern = re.compile(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)")
    matches = pattern.findall(tweet)
    emoji_string = " ".join(matches)
    if len(emoji_string) < 1:
        emoji_string = "noemoji"
    return emoji_string.lower().strip()

def num_emojis(string):
    pattern = re.compile(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)")
    matches = pattern.findall(string)
    emoji_string = " ".join(matches)
    return float(len(emoji_string.split()))

def emoji_tokenizer(text):
    return text.strip().split()


# length of tweet
def length_of_tweet(text):
    return len(text.lower().strip().split())

# num of slurrs
def num_of_slurrs(text):
    num_of_slurrs = float(text.count("$&@*#")) + float(text.count("fuck")) + float(text.count("crap"))
    return num_of_slurrs


# emoji score 
def get_emoji_score(tweet):
    emoji_type_dict = {
        "noemoji": 0,
        ':(': -2,
        ':)': 2,
        ':-(': -2,
        ':-)': 2,
        ':-D': 2,
        ':D': 2,
        ':P': -1,
        ';)': 0,
        ';-)': 0,
        ';D': 0,
        '=(': -2,
        '=)': 2,
        '=D': 2,
        '=P': 0,
        ':-P': -1
    }
    
    pattern = re.compile(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)")
    matches = pattern.findall(tweet)
    emoji_string = " ".join(matches)
    if len(emoji_string) < 1:
        emoji_string = "noemoji"
    score_list = [emoji_type_dict[e] for e in emoji_string.split()]
    return sum(score_list)

In [48]:
train.head()

Unnamed: 0,id,label,tweet,topics,num_topics,extracted_emojis,num_emojis,length_of_tweet,num_of_slurrs,emoji_score
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy android apps beautiful c...,11.0,noemoji,0.0,13,0.0,0
1,2,0,Finally a transparant silicon case ^^ Thanks t...,yay sony xperia s sonyexperias…,5.0,:),1.0,17,0.0,2
2,3,0,We love this! Would you go? #talk #makememorie...,talk makememories unplug relax iphone smartpho...,8.0,noemoji,0.0,15,0.0,0
3,4,0,I'm wired I know I'm George I was made that wa...,iphone cute daventry home,4.0,;),1.0,17,0.0,0
4,5,1,What amazing service! Apple won't even talk to...,no_topics,0.0,noemoji,0.0,23,0.0,0


In [49]:
tweet = train.tweet.tolist()[0]
tweet

'#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone'

In [50]:
preprocess_tweet(tweet)

'test url'

In [51]:
train["topics"] = train.tweet.apply(extract_topics)
train["num_topics"] = train.tweet.apply(num_topics)
train["extracted_emojis"] = train.tweet.apply(extract_emojis)
train["num_emojis"] = train.tweet.apply(num_emojis)
train["length_of_tweet"] = train.tweet.apply(length_of_tweet)
train["num_of_slurrs"] = train.tweet.apply(num_of_slurrs)
train["emoji_score"] = train.tweet.apply(get_emoji_score)
train["tweet_preprocessed"] = train.tweet.apply(preprocess_tweet)



# test["topics"] = test.tweet.apply(extract_topics)
# test["num_topics"] = test.tweet.apply(num_topics)
# test["extracted_emojis"] = test.tweet.apply(extract_emojis)
# test["num_emojis"] = test.tweet.apply(num_emojis)
# test["length_of_tweet"] = test.tweet.apply(length_of_tweet)
# test["num_of_slurrs"] = test.tweet.apply(num_of_slurrs)
# test["emoji_score"] = test.tweet.apply(get_emoji_score)

In [62]:
train.head()

Unnamed: 0,id,label,tweet,topics,num_topics,extracted_emojis,num_emojis,length_of_tweet,num_of_slurrs,emoji_score,tweet_preprocessed
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,pregnancy beautiful health igers cute iphoneon...,11.0,noemoji,0.0,13,0.0,0,test url
1,2,0,Finally a transparant silicon case ^^ Thanks t...,s xperia sonyexperias… yay sony,5.0,:),1.0,17,0.0,2,finally a transparant silicon case ^^ thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...,talk unplug iphone smartphone wifi relax makem...,8.0,noemoji,0.0,15,0.0,0,we love this! would you go? url
3,4,0,I'm wired I know I'm George I was made that wa...,home cute iphone daventry,4.0,;),1.0,17,0.0,0,i'm wired i know i'm george i was made that wa...
4,5,1,What amazing service! Apple won't even talk to...,no_topics,0.0,noemoji,0.0,23,0.0,0,what amazing service! apple won't even talk to...


In [72]:
# len(set([w.lower() for item in train.topics.tolist() for w in item.split()]))

# len(set([w.lower() for item in train[train.label == 0].topics.tolist() for w in item.split()]))

# len(set([w.lower() for item in train[train.label == 1].topics.tolist() for w in item.split()]))

# train[train.label == 0].num_topics.sum()

# train[train.label == 1].num_topics.sum()

# train.emoji_score.hist()

# train.num_topics.hist()

# train.num_emojis.hist()

# train.length_of_tweet.hist()

# train.num_of_slurrs.hist()

In [64]:
# numerical features
numeric_features = ["num_topics", "num_emojis", "length_of_tweet", 'num_of_slurrs', 'emoji_score']
numeric_transformer = Pipeline(
    [
        ("scaler", StandardScaler())
    ]
)


# categorical features
categorical_features = []
# categorical_transformer = OneHotEncoder(handle_unknown="ignore")


# text features
text_features = ['tweet_preprocessed', 'topics', 'extracted_emojis']
text_transformer = FeatureUnion(
    [
        (
            'tweet_tfidf',
            Pipeline(
                [
                    (
                        'extract_field',
                        FunctionTransformer(lambda x: x['tweet_preprocessed'], validate=False)
                    ),
                    (
                        'tfidf',
                        TfidfVectorizer()
                    )
                ]
            )
        ),
        (
            'topic_tfidf',
            Pipeline(
                [
                    (
                        'extract_field',
                        FunctionTransformer(lambda x: x['topics'], validate=False)
                    ),
                    (
                        'tfidf',
                        TfidfVectorizer()
                    )
                ]
            )
        ),
        (
            'emoji_tfidf',
            Pipeline(
                [
                    (
                        'extract_field',
                        FunctionTransformer(lambda x: x['extracted_emojis'], validate=False)
                    ),
                    (
                        'tfidf',
                        TfidfVectorizer()
                    )
                ]
            )
        )
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("text", text_transformer, text_features),
    ]
)

In [73]:
ensemble = VotingClassifier(
    estimators=[
        ('logistic',  LogisticRegression()),
        ("svm", SVC()),
        ("random_forest", RandomForestClassifier()),
        ('knn', KNeighborsClassifier()),
        ('ada', AdaBoostClassifier())
    ]
)
        
# steps = [('scale', RobustScaler()),
#          ('ec',ensemble)]
# ensemble_classifiers = Pipeline(steps)


ppl = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", ensemble)]
)


In [22]:
ppl

In [95]:
parameters = {
    # tweet text features
    "preprocessor__text__tweet_tfidf__tfidf__ngram_range": [(1, 2)],
    # "preprocessor__text__tweet_tfidf__tfidf__min_df": [0, 3, 5],
    # "preprocessor__text__tweet_tfidf__tfidf__max_df": [0.90, 0.95],
    
    # topic text features
    "preprocessor__text__topic_tfidf__tfidf__ngram_range": [(1, 1)],
    # "preprocessor__text__topic_tfidf__tfidf__min_df": [0, 3, 5],
    # "preprocessor__text__topic_tfidf__tfidf__max_df": [0.90, 0.95],
    
    # emoji text features
    "preprocessor__text__emoji_tfidf__tfidf__ngram_range": [(1, 1)],
    # "preprocessor__text__emoji_tfidf__tfidf__min_df": [0, 3, 5],
    # "preprocessor__text__emoji_tfidf__tfidf__max_df": [0.90, 0.95],

    # classifier features
    "classifier__logistic__C": [0.1, 1, 10],
    "classifier__logistic__penalty": ['elasticnet'],
    "classifier__logistic__solver": ['saga'],
    "classifier__logistic__l1_ratio": [0.5],
    'classifier__logistic__class_weight': ['balanced'],

    "classifier__svm__kernel": ["linear"],
    "classifier__svm__C": [0.1, 1, 10],
    'classifier__random_forest__class_weight': ['balanced'],
    
    'classifier__random_forest__n_estimators': [500],
    # 'classifier__random_forest__max_depth': [500],
    'classifier__random_forest__class_weight': ['balanced'],
    
    'classifier__knn__n_neighbors': [5],
    'classifier__knn__weights': ['uniform', 'distance'],
    
    'classifier__ada__n_estimators': [500],
    'classifier__ada__learning_rate': [0.1, 1, 5]
}

In [90]:
total_fits = 10
for k, v in parameters.items():
    total_fits *= len(v)
    
total_fits

540

In [91]:
train.head()

Unnamed: 0,id,label,tweet,topics,num_topics,extracted_emojis,num_emojis,length_of_tweet,num_of_slurrs,emoji_score,tweet_preprocessed
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,pregnancy beautiful health igers cute iphoneon...,11.0,noemoji,0.0,13,0.0,0,test url
1,2,0,Finally a transparant silicon case ^^ Thanks t...,s xperia sonyexperias… yay sony,5.0,:),1.0,17,0.0,2,finally a transparant silicon case ^^ thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...,talk unplug iphone smartphone wifi relax makem...,8.0,noemoji,0.0,15,0.0,0,we love this! would you go? url
3,4,0,I'm wired I know I'm George I was made that wa...,home cute iphone daventry,4.0,;),1.0,17,0.0,0,i'm wired i know i'm george i was made that wa...
4,5,1,What amazing service! Apple won't even talk to...,no_topics,0.0,noemoji,0.0,23,0.0,0,what amazing service! apple won't even talk to...


In [96]:
# train the classifier
clf = GridSearchCV(ppl, parameters, cv=10, scoring="f1", verbose=1, n_jobs=-1)
clf.fit(train, train["label"])

Fitting 10 folds for each of 54 candidates, totalling 540 fits


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

In [None]:
from sklearn.utils import estimator_html_repr

with open("pipeline_image.html", "w") as f:
    f.write(estimator_html_repr(ppl))

In [97]:
print("Gridsearch score is {}".format(clf.best_score_))
# print("Score on test set is {}".format(clf.score(test, test["label"])))

Gridsearch score is 0.8206568398261513


In [100]:
clf.best_params_

{'classifier__ada__learning_rate': 1,
 'classifier__ada__n_estimators': 500,
 'classifier__knn__n_neighbors': 5,
 'classifier__knn__weights': 'uniform',
 'classifier__logistic__C': 1,
 'classifier__logistic__class_weight': 'balanced',
 'classifier__logistic__l1_ratio': 0.5,
 'classifier__logistic__penalty': 'elasticnet',
 'classifier__logistic__solver': 'saga',
 'classifier__random_forest__class_weight': 'balanced',
 'classifier__random_forest__n_estimators': 500,
 'classifier__svm__C': 1,
 'classifier__svm__kernel': 'linear',
 'preprocessor__text__emoji_tfidf__tfidf__ngram_range': (1, 1),
 'preprocessor__text__topic_tfidf__tfidf__ngram_range': (1, 1),
 'preprocessor__text__tweet_tfidf__tfidf__ngram_range': (1, 2)}

In [83]:
parameters = {
    # tweet text features
    "preprocessor__text__tweet_tfidf__tfidf__ngram_range": [(1, 2)],
    # "preprocessor__text__tweet_tfidf__tfidf__min_df": [0, 3, 5],
    # "preprocessor__text__tweet_tfidf__tfidf__max_df": [0.90, 0.95],
    
    # topic text features
    "preprocessor__text__topic_tfidf__tfidf__ngram_range": [(1, 1)],
    # "preprocessor__text__topic_tfidf__tfidf__min_df": [0, 3, 5],
    # "preprocessor__text__topic_tfidf__tfidf__max_df": [0.90, 0.95],
    
    # emoji text features
    "preprocessor__text__emoji_tfidf__tfidf__ngram_range": [(1, 1)],
    # "preprocessor__text__emoji_tfidf__tfidf__min_df": [0, 3, 5],
    # "preprocessor__text__emoji_tfidf__tfidf__max_df": [0.90, 0.95],

    # classifier features
    "classifier__logistic__C": [1],
    "classifier__logistic__penalty": ['elasticnet'],
    "classifier__logistic__solver": ['saga'],
    "classifier__logistic__l1_ratio": [0.5],
    'classifier__logistic__class_weight': ['balanced'],

    "classifier__svm__kernel": ["linear"],
    "classifier__svm__C": [1],
    'classifier__random_forest__class_weight': ['balanced'],
    
    'classifier__random_forest__n_estimators': [500],
    # 'classifier__random_forest__max_depth': [500],
    'classifier__random_forest__class_weight': ['balanced'],
    
    'classifier__knn__n_neighbors': [5],
    'classifier__knn__weights': ['uniform'],
    
    'classifier__ada__n_estimators': [500],
    'classifier__ada__learning_rate': [1]
}

In [101]:
ppl = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", ensemble)]
)

In [103]:
# Update the pipeline with the best parameters
for param_name in clf.best_params_:
    step, param = param_name.split('__', 1)
    setattr(ppl.named_steps[step], param, clf.best_params_[param_name])


In [105]:
ppl.fit(train, train["label"])

In [110]:
test_data = pd.read_csv('test_oJQbWVk.csv')
print(test_data.shape)
test_data.head()

(1953, 2)


Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [107]:
test_data["topics"] = test_data.tweet.apply(extract_topics)
test_data["num_topics"] = test_data.tweet.apply(num_topics)
test_data["extracted_emojis"] = test_data.tweet.apply(extract_emojis)
test_data["num_emojis"] = test_data.tweet.apply(num_emojis)
test_data["length_of_tweet"] = test_data.tweet.apply(length_of_tweet)
test_data["num_of_slurrs"] = test_data.tweet.apply(num_of_slurrs)
test_data["emoji_score"] = test_data.tweet.apply(get_emoji_score)
test_data["tweet_preprocessed"] = test_data.tweet.apply(preprocess_tweet)

test_data.head()

Unnamed: 0,id,tweet,topics,num_topics,extracted_emojis,num_emojis,length_of_tweet,num_of_slurrs,emoji_score,tweet_preprocessed
0,7921,I hate the new #iphone upgrade. Won't let me d...,apple iphone ugh,3.0,noemoji,0.0,14,0.0,0,i hate the new upgrade. won't let me download ...
1,7922,currently shitting my fucking pants. #apple #i...,swagswagswag imac raddest apple cashmoney,5.0,noemoji,0.0,11,1.0,0,currently shitting my fucking pants. url
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t...",no_topics,0.0,noemoji,0.0,20,0.0,0,"i'd like to puts some cd-roms on my ipad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...,sobbing hatinglife,2.0,noemoji,0.0,23,0.0,0,my ipod is officially dead. i lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...,no_topics,0.0,noemoji,0.0,14,1.0,0,been fighting itunes all night! i only want th...


In [108]:
predictions = ppl.predict(test_data)

In [109]:
predictions.shape

(1953,)

In [113]:
submit_df = pd.DataFrame()

submit_df['id'] = test_data['id']
submit_df['label'] = predictions

print(submit_df.shape)
submit_df.head()

(1953, 2)


Unnamed: 0,id,label
0,7921,1
1,7922,0
2,7923,1
3,7924,1
4,7925,1


In [114]:
submit_df.to_csv('first_submission.csv', index=False)

In [83]:
# train.to_csv('data/train_data.csv', index=False)

In [103]:
def create_df(text):
    df = pd.DataFrame()
    df['tweet'] = [text]
    df["topics"] = df.tweet.apply(extract_topics)
    df["num_topics"] = df.tweet.apply(num_topics)
    df["extracted_emojis"] = df.tweet.apply(extract_emojis)
    df["num_emojis"] = df.tweet.apply(num_emojis)
    df["length_of_tweet"] = df.tweet.apply(length_of_tweet)
    df["num_of_slurrs"] = df.tweet.apply(num_of_slurrs)
    df["emoji_score"] = df.tweet.apply(get_emoji_score)
    df['tweet'] = df.tweet.apply(preprocess_tweet)
    return df

In [104]:
df = create_df('I am feeling good today')
df.head()

Unnamed: 0,tweet,topics,num_topics,extracted_emojis,num_emojis,length_of_tweet,num_of_slurrs,emoji_score
0,i am feeling good today,no_topics,0.0,noemoji,0.0,5,0.0,0
