In [20]:
import re
import os
import nltk
import pandas as pd
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('movie_reviews')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/arie/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/arie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/arie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.
    Args:
        s: The string to be cleaned up.
    Returns:
        A string that has been cleaned up.
    """
    comp = re.compile('(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+|[\W\d]')
    return (re.sub(comp," ",s)).lower()

In [22]:
def tokenize(s):
    """
    Tokenize a string.
    Args:
        s: String to be tokenized.
    Returns:
        A list of words as the result of tokenization.
    """
    return word_tokenize(s)

In [23]:
def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words.
    Args:
        l: A list of strings.
    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in l]


In [24]:
def remove_stopwords(l):
    """
    Remove English stopwords from a list of strings.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after stop words are removed.
    """
    stopWords = set(stopwords.words('english'))
    filtered = [word for word in l if word not in stopWords and len(word)>1]
    return filtered

In [25]:
df = pd.read_csv("Sentiment140.csv")

In [26]:
# 0 = negative, 4 = positive
df.head()
df = df.sample(1500)
df.target.value_counts()

4    779
0    721
Name: target, dtype: int64

In [27]:
df["text_processed"] = df["text"].apply(clean_up)
df["text_processed"] = df["text_processed"].apply(tokenize)
df["text_processed"] = df["text_processed"].apply(stem_and_lemmatize)
df["text_processed"]=df["text_processed"].apply(remove_stopwords)

In [28]:
words = df["text_processed"].tolist()

In [29]:
all_words = [word2 for word in words for word2 in word]
all_words2 = nltk.FreqDist(all_words)
word_features = list(all_words2.keys())[:500]

In [30]:
df_util = df[["id","target","text_processed"]]
df_util["target"]= df_util["target"].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [31]:
palabras = df_util["text_processed"].tolist()
target = df_util["target"].tolist()
documents = list(zip(palabras,target))

In [32]:
def find_features(document):
    word = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
        
    return features

In [33]:
features_set = [(find_features(doc),target) for (doc,target) in documents]

In [45]:
features_set[9]

({'greggrunberg': False,
  'deaf': False,
  'wait': False,
  'star': False,
  'trek': False,
  'dvd': False,
  'arrive': False,
  'later': False,
  'wizzzle': False,
  'im': False,
  'bored': False,
  'dunno': False,
  'nose': False,
  'stuffed': False,
  'want': False,
  'hate': False,
  'wasting': False,
  'gorgeous': False,
  'day': False,
  'sick': False,
  'season': False,
  'office': False,
  'funny': False,
  'jim': False,
  'mess': False,
  'dwight': False,
  'hot': False,
  'self': False,
  'oral': False,
  'surgen': False,
  'found': False,
  'blood': False,
  'clot': False,
  'poked': False,
  'mouth': False,
  'hurt': False,
  'man': False,
  'cried': False,
  'antibotics': False,
  'made': False,
  'peanut': False,
  'butter': False,
  'buckwheat': False,
  'pancake': False,
  'amp': False,
  'strawberry': False,
  'blueberry': False,
  'granola': False,
  'vanilla': False,
  'yogurt': False,
  'parfait': False,
  'breakfast': False,
  'yum': False,
  'gagging': False,
  '

In [37]:
training_set = features_set[:1400]
testing_set = features_set[1400:]
classifier = nltk.NaiveBayesClassifier.train(training_set)
nltk.classify.accuracy(classifier,testing_set)*100

46.0

In [41]:
df_util.head(10)

Unnamed: 0,id,target,text_processed
92287,1759964968,0,"[greggrunberg, deaf, wait, star, trek, dvd, ar..."
700636,2254825925,0,"[wizzzle, im, bored, dunno]"
318495,2002833903,0,"[nose, stuffed, want, hate, wasting, gorgeous,..."
771810,2302515240,0,"[season, office, funny, jim, office, mess, dwi..."
565883,2206657060,0,"[oral, surgen, found, blood, clot, poked, mout..."
959636,1826102402,4,"[made, peanut, butter, buckwheat, pancake, amp..."
481793,2179777105,0,"[gagging, love, take, phone]"
1484500,2067683800,4,"[hey, hey, tweet, good, nite, last, go, maybe,..."
235031,1979704809,0,"[jack_of_clubs, la, roux, better, sound, summe..."
810220,1469703214,4,"[sound, george, working, good]"
