In [10]:
import numpy as np
import pandas as pd

# nltk.download('all')
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import words, stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

# suppress deprecation warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# import python helper functions
from helpers import *

In [2]:
# Project folder on the author's machine. It is not referenced by the cells
# visible here. Written as a raw string so the backslashes of the Windows path
# are never parsed as escape sequences ("\I" and "\P" are invalid escapes and
# trigger a warning in a normal string literal); the resulting value is unchanged.
directory = r"F:\Ingenieur_IA\P7_sentiment_analysis"

# Load the raw tweets. The file is read without a header row (header=None), so
# pandas labels the columns with integers; it is decoded as ISO-8859-1.
data = pd.read_csv("../data/tweets.csv", delimiter=",", encoding="ISO-8859-1", header=None)

In [3]:
# Keep only the sentiment label (column 0) and the tweet text (column 5), under
# descriptive names.
# Renaming first and then selecting by name keeps this cell safe to re-run:
# `rename` silently ignores keys that are no longer present, whereas selecting
# `data[[0, 5]]` raises KeyError once the columns have already been renamed.
data = data.rename(columns={0: "target", 5: "tweet"})[["target", "tweet"]]
data

Unnamed: 0,target,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [4]:
# Print the label legend, then show how many rows carry each label.
print("0 represents negative reviews, 4 represents positive reviews")
data["target"].value_counts()

0 represents negative reviews, 4 represents positive reviews


0    800000
4    800000
Name: target, dtype: int64

In [5]:
# Build a balanced working sample: 1000 rows per label (0 and 4), drawn with a
# fixed random_state so the draw is reproducible.
# The per-label pieces are collected and concatenated once: `DataFrame.append`
# was deprecated in pandas 1.4 and removed in 2.0, and growing a frame inside a
# loop copies it again on every iteration.
sample_df = pd.concat(
    [
        data[data["target"] == target].sample(1000, random_state=42)
        for target in [0, 4]
    ],
    ignore_index=True,  # fresh 0..N-1 index instead of the original row labels
)

In [6]:
# Display the sampled frame (label + raw tweet) as the cell output.
sample_df

Unnamed: 0,target,tweet
0,0,@xnausikaax oh no! where did u order from? tha...
1,0,A great hard training weekend is over. a coup...
2,0,"Right, off to work Only 5 hours to go until I..."
3,0,I am craving for japanese food
4,0,Jean Michel Jarre concert tomorrow gotta work...
...,...,...
1995,4,@abidabbidoos i will try and make myself reali...
1996,4,Feeling lazy... just cooking rice + egg for di...
1997,4,@MartinAmis &quot;IT'S PERFECT THE WAY IT IS!!...
1998,4,@gloriabell Crazy in the most wonderful kind o...


In [7]:
# Run the project's `text_cleaner` helper on every raw tweet and store the
# result in a new column; the helper is passed directly, no wrapper needed.
sample_df["clean_tweet"] = sample_df["tweet"].apply(text_cleaner)

In [8]:
# Display the frame again, now including the `clean_tweet` column.
sample_df

Unnamed: 0,target,tweet,clean_tweet
0,0,@xnausikaax oh no! where did u order from? tha...,"[order, horrible]"
1,0,A great hard training weekend is over. a coup...,"[great, hard, training, weekend, couple, rest,..."
2,0,"Right, off to work Only 5 hours to go until I...","[right, work, free]"
3,0,I am craving for japanese food,[craving]
4,0,Jean Michel Jarre concert tomorrow gotta work...,"[jean, concert, tomorrow, work, though]"
...,...,...,...
1995,4,@abidabbidoos i will try and make myself reali...,"[make, realize, importance, tied]"
1996,4,Feeling lazy... just cooking rice + egg for di...,"[feeling, lazy, cooking, rice, dinner, drizzle..."
1997,4,@MartinAmis &quot;IT'S PERFECT THE WAY IT IS!!...,"[quot, perfect, quot, wrong, joke]"
1998,4,@gloriabell Crazy in the most wonderful kind o...,"[crazy, wonderful, kind]"


In [12]:
# Collapse each cleaned token list into one space-separated string, the
# document form used for vectorization further down.
sample_df["docs"] = sample_df["clean_tweet"].apply(" ".join)
sample_df.head()

Unnamed: 0,target,tweet,clean_tweet,docs
0,0,@xnausikaax oh no! where did u order from? tha...,"[order, horrible]",order horrible
1,0,A great hard training weekend is over. a coup...,"[great, hard, training, weekend, couple, rest,...",great hard training weekend couple rest comput...
2,0,"Right, off to work Only 5 hours to go until I...","[right, work, free]",right work free
3,0,I am craving for japanese food,[craving],craving
4,0,Jean Michel Jarre concert tomorrow gotta work...,"[jean, concert, tomorrow, work, though]",jean concert tomorrow work though


In [20]:
# TF-IDF settings, named so they are easy to find and tune.
TFIDF_MAX_FEATURES = 1000      # cap on the vocabulary size
TFIDF_NGRAM_RANGE = (1, 2)     # use both unigrams and bigrams

# `sample_df["docs"]` (space-joined cleaned tokens) is already built by the
# earlier cell, so it is not recomputed here.
tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=TFIDF_NGRAM_RANGE)
tfidf_vectors = tfidf.fit_transform(sample_df["docs"])

# Densify the sparse matrix into a labelled frame for inspection. The rows are
# explicitly aligned with `sample_df` (the frame the documents came from), not
# with the full `data` frame, whose length differs.
data_tfidf = pd.DataFrame(
    tfidf_vectors.toarray(),
    index=sample_df.index,
    columns=tfidf.get_feature_names_out(),
)

print("Shape of tf-idf matrix: " + str(data_tfidf.shape))
data_tfidf.head(5)

Shape of tf-idf matrix: (2000, 1000)


Unnamed: 0,able,absolutely,accent,account,actually,addicted,adorable,afternoon,agree,airport,...,would cool,write,writing,wrong,yeah,year,yellow,yesterday,young,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
