In [1]:
import pandas as pd
import regex as re
import nltk
import preprocessor as p
import joblib
import pickle

In [2]:
data_array = ["art2.csv", "game2.csv", "movie2.csv","music2.csv","politic2.csv","science2.csv","sport2.csv","technology2.csv","travel2.csv","TV2.csv"]

In [3]:
data = []
for i in data_array:
    data.append(pd.read_csv(i))
df = pd.concat(data)
#df.drop(["index"], inplace=True)
df = df.reset_index(drop = True)

In [4]:
df.head()

Unnamed: 0,0,1
0,Today's #DailyDeviation winners are here! Ft: ...,art
1,Check out today's #DailyDeviation winners! Ft:...,art
2,ANNOUNCEMENT! My very first @Kickstarter! **Si...,art
3,Geometric Pattern: Intersect Square: Black/Bat...,art
4,Geometric Pattern: Intersect Square: Black/Bat...,art


In [5]:
df = df.rename(columns={'0': "tweet_text", '1' : "category"})

In [6]:
#new columns for all hashtags
df['hashtags'] = df['tweet_text'].apply(lambda x: re.findall(r"#(\w+)", x))

In [7]:
df.head()

Unnamed: 0,tweet_text,category,hashtags
0,Today's #DailyDeviation winners are here! Ft: ...,art,"[DailyDeviation, Art]"
1,Check out today's #DailyDeviation winners! Ft:...,art,"[DailyDeviation, Art]"
2,ANNOUNCEMENT! My very first @Kickstarter! **Si...,art,"[warlock, BATTLEOFTHEBARDS, dnd, bards, art, s..."
3,Geometric Pattern: Intersect Square: Black/Bat...,art,"[art, geometricpattern, geometry, retro, circl..."
4,Geometric Pattern: Intersect Square: Black/Bat...,art,"[art, geometricpattern, geometry, retro, circl..."


In [8]:
#remove just hashtag tag then using tweet-preprocessor for links, emoticons, etc. then remove punctuations
for i in range(len(df["tweet_text"])):
    #df["tweet_text"][i] = re.sub(r'(?is)#', '', df["tweet_text"][i])
    df["tweet_text"][i] = p.clean(df["tweet_text"][i])
    df["tweet_text"][i] = re.sub(r'[^\w\s]','',df["tweet_text"][i])

In [9]:
df["tweet_text"][1]

'Check out todays winners Ft    and many moreCheck out all of the winning here'

## Normalization and stemming and Lemmatization

In [10]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
stop_words.update(('new','year', 'happy', 'first', 'one', 'lets', 'best', 'via', 'two', 'three', 'amp'))
for i in range(len(df["tweet_text"])):
    word_tokens = word_tokenize(df["tweet_text"][i].lower())
    filtered_sentence = ""
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence += w
            filtered_sentence += " "
    filtered_sentence = filtered_sentence.strip()
    df["tweet_text"][i] = filtered_sentence

In [11]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer 
for i in range(len(df["tweet_text"])):
    sentence = ""
    tweet = df["tweet_text"][i].lower().split()
    porter = nltk.SnowballStemmer('english')
    WNlemma = WordNetLemmatizer()
    for k in tweet:
        stemming = porter.stem(k)
        lem = WNlemma.lemmatize(stemming)
        sentence += lem
        sentence += " "
    sentence = sentence.strip()
    df["tweet_text"][i] = sentence

In [12]:
df2 = df.copy()

In [13]:
categories = df["category"].unique().tolist()
for i in categories:
    df2[i] = 0

In [14]:
for cat in categories:
    for i in range(len(df2[cat])):
        if df2["category"][i]  == cat:
            df2[cat][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[cat][i] = 1


In [15]:
df2.head()

Unnamed: 0,tweet_text,category,hashtags,art,game,movie,music,politic,science,sport,technology,travel,TV
0,today winner ft mani morecheck win,art,"[DailyDeviation, Art]",1,0,0,0,0,0,0,0,0,0
1,check today winner ft mani morecheck win,art,"[DailyDeviation, Art]",1,0,0,0,0,0,0,0,0,0
2,announc siren song battl bard live april partn...,art,"[warlock, BATTLEOFTHEBARDS, dnd, bards, art, s...",1,0,0,0,0,0,0,0,0,0
3,geometr pattern intersect squar blackbattleshi...,art,"[art, geometricpattern, geometry, retro, circl...",1,0,0,0,0,0,0,0,0,0
4,geometr pattern intersect squar blackbattleshi...,art,"[art, geometricpattern, geometry, retro, circl...",1,0,0,0,0,0,0,0,0,0


### Frequency of words

In [16]:
from nltk.probability import FreqDist
from nltk import word_tokenize

fdist = FreqDist()

In [17]:
for i in categories: 
    freq = FreqDist(sum(df2[df2["category"]== i].tweet_text.map(word_tokenize), []))
    sort_orders = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    count = 0
    print(i + ": ", end= " ")
    for k in sort_orders:
        print(k[0], k[1], end= " ")
        count += 1
        if count == 10:
            break
    print("\n")

art:  art 270 amaz 237 commiss 106 day 104 help 96 love 95 piper 93 time 92 go 91 draw 91 

game:  play 490 game 428 year 290 day 273 special 246 everi 223 prepar 211 gift 209 obey 207 yearthank 206 

movie:  movi 438 watch 202 film 191 alan 114 review 114 full 111 last 107 found 99 footag 91 vintag 90 

music:  music 318 song 177 play 170 live 146 check 134 video 117 love 106 tune 105 listen 101 come 92 

politic:  presid 234 trump 216 vote 216 elect 144 senat 126 stimulus 108 bill 108 u 108 veto 108 calif 108 

science:  u 370 follow 218 scienc 216 learn 200 day 147 creat 136 get 121 would 121 machin 119 az 117 

sport:  sport 207 anoth 159 post 150 favorit 147 insta 145 make 127 thank 125 support 120 live 119 sign 110 

technology:  use 117 day 111 project 101 free 86 javascript 82 technolog 80 code 79 intellig 78 ai 77 get 76 

travel:  travel 293 u 147 check 101 experi 100 visit 96 stay 96 amaz 94 beauti 94 slide 92 thing 92 

TV:  tv 356 along 192 see 184 watch 164 much 153 show 

### Train-Test Split

In [18]:
from sklearn.model_selection import train_test_split

#X = df2["tweet_text"]
#y = df2.drop(columns = ["tweet_text", "category", "hashtags"])

train, test = train_test_split(df2, random_state=42, test_size=0.2)
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [19]:
train_text = train['tweet_text']
test_text = test['tweet_text']

In [20]:
df2.to_csv("preprocessed_data.csv", index=False)

### TF-IDF

In [21]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#vectorizer = CountVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,2))
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,2), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)


TfidfVectorizer(ngram_range=(1, 2), strip_accents='unicode')

In [22]:
x_train = vectorizer.transform(train_text)
y_train = train.drop(columns = ["tweet_text", "category", "hashtags"])

x_test = vectorizer.transform(test_text)
y_test = test.drop(columns = ["tweet_text", "category", "hashtags"])

### Multi-Label ClassificationMulti-Label Classification

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

In [24]:
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing art comments...**
Test accuracy is 0.9343454258675079


**Processing game comments...**
Test accuracy is 0.9337539432176656


**Processing movie comments...**
Test accuracy is 0.9380914826498423


**Processing music comments...**
Test accuracy is 0.9152208201892744


**Processing politic comments...**
Test accuracy is 1.0


**Processing science comments...**
Test accuracy is 0.9361198738170347


**Processing sport comments...**
Test accuracy is 0.936711356466877


**Processing technology comments...**
Test accuracy is 0.9378943217665615


**Processing travel comments...**
Test accuracy is 0.9240930599369085


**Processing TV comments...**
Test accuracy is 0.9589905362776026




In [25]:
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing art comments...**
Test accuracy is 0.9349369085173501


**Processing game comments...**
Test accuracy is 0.9481466876971609


**Processing movie comments...**
Test accuracy is 0.9430205047318612


**Processing music comments...**
Test accuracy is 0.9305993690851735


**Processing politic comments...**
Test accuracy is 0.9970425867507886


**Processing science comments...**
Test accuracy is 0.951301261829653


**Processing sport comments...**
Test accuracy is 0.95051261829653


**Processing technology comments...**
Test accuracy is 0.9536671924290221


**Processing travel comments...**
Test accuracy is 0.9376971608832808


**Processing TV comments...**
Test accuracy is 0.9700315457413249




In [26]:
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(CalibratedClassifierCV(LinearSVC()), n_jobs=-1)),
            ])

for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing art comments...**
Test accuracy is 0.9656940063091483


**Processing game comments...**
Test accuracy is 0.9725946372239748


**Processing movie comments...**
Test accuracy is 0.9712145110410094


**Processing music comments...**
Test accuracy is 0.9524842271293376


**Processing politic comments...**
Test accuracy is 1.0


**Processing science comments...**
Test accuracy is 0.9674684542586751


**Processing sport comments...**
Test accuracy is 0.9704258675078864


**Processing technology comments...**
Test accuracy is 0.9800867507886435


**Processing travel comments...**
Test accuracy is 0.9572160883280757


**Processing TV comments...**
Test accuracy is 0.9982255520504731




In [223]:
pickle.dump(vectorizer, open("vector.pickel", "wb"))

In [25]:
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None), n_jobs=-1)),
            ])
with open("models2.pckl", "wb") as f:
    for category in categories:
        LogReg_pipeline.fit(x_train, train[category])
        pickle.dump(LogReg_pipeline, f)