<a href="https://colab.research.google.com/github/jfr11101/MyProjects/blob/main/Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pickle
import numpy as np
import nltk, re, string
from nltk.corpus import stopwords, twitter_samples


In [18]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
def process_tweet(tweet):
  stemmer = nltk.PorterStemmer()
  stopwords_english = stopwords.words('english')
  tweet = re.sub(r'\$\w*', '', tweet)
  tweet = re.sub(r'^RT[\s]+', '', tweet)
  tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
  tweet = re.sub(r'#', '', tweet)
  tokenizer = nltk.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  tweets_clean = []
  for word in tweet_tokens:
    if (word not in stopwords_english and word not in string.punctuation):
      stem_word = stemmer.stem(word)
      tweets_clean.append(stem_word)

  return tweets_clean


In [20]:
def build_freqs(tweets, ys):
  yslist = np.squeeze(ys).tolist()
  freqs = {}
  for y, tweet in zip(yslist, tweets):
    for word in process_tweet(tweet):
      pair = (word, y)
      if pair in freqs:
        freqs[pair] += 1
      else:
        freqs[pair] = 1

  return freqs

In [21]:
tweets = ['I am happy', 'I am tricked', 'I am sad', 'I am tired', 'I am tired']
ys = [1,0,0,0,0]
res = build_freqs(tweets, ys)
print(res)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}


In [24]:
nltk.download("twitter_samples")

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [25]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [26]:
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]
train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]

In [27]:
X_train = train_pos + train_neg
X_test = test_pos + test_neg

In [29]:
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [30]:
freqs = build_freqs(X_train, y_train)

In [31]:
type(freqs)

dict

In [33]:
len(freqs.keys())

11337

In [34]:
X_train[22]

'@gculloty87 Yeah I suppose she was lol! Chat in a bit just off out x :))'

In [35]:
process_tweet(X_train[22])

['yeah', 'suppos', 'lol', 'chat', 'bit', 'x', ':)']

In [37]:
def sigmoid(z):
  zz = np.negative(z)
  h = 1 / (1+np.exp(zz))

  return h

In [38]:
def gradDiscent(x, y, tetha, alpha, num_items):
  m = x.shape[0]
  for i in range(0, num_items):
    z = np.dot(x, tetha)
    h = sigmoid(z)
    cost = -1/m*(np.dot(y.transpose(), np.log(h))+np.dot((1-y).transpose(), np.log(1-h)))
    tetha = tetha - (alpha/m)*np.dot(x.transpose(), (h-y))

  cost = float(cost)
  return cost, tetha

In [39]:
def extract_features(tweet, freqs):
  word_l = process_tweet(tweet)
  x = np.zeros((1,3))

  x[0,0] = 1
  for word in word_l:
    x[0,1] += freqs.get((word, 1.0), 0)
    x[0,2] += freqs.get((word, 0.0), 0)

  assert(x.shape == (1,3))

  return x

In [40]:
temp = extract_features(X_train[22], freqs)

In [42]:
print(temp)

[[1.000e+00 3.006e+03 1.240e+02]]


#Training Model

In [46]:
X = np.zeros((len(X_train), 3))
for i in range(len(X_train)):
  X[i, :] = extract_features(X_train[22], freqs)

Y = y_train
J, theta = gradDiscent(X, Y, np.zeros((3,1)), 1e-9, 1500)

In [47]:
def predict_tweet(tweet, freqs, theta):
  x = extract_features(tweet, freqs)
  y_pred = sigmoid(np.dot(x, theta))

  return y_pred

In [50]:
def test_logistic_regression(X_test, y_test, freqs, theta):
  y_hat = []
  for tweet in X_test:
    y_pred = predict_tweet(tweet, freqs, theta)
    if y_pred > 0.5:
      y_hat.append(1)
    else:
      y_hat.append(0)

  accuracy = (y_hat == np.squeeze(y_test)).sum()/len(X_test)

  return accuracy

In [51]:
tmp_accuacy = test_logistic_regression(X_test, y_test, freqs, theta)
print(f'Logistic regression model accuracy: {tmp_accuacy:.4f}')

Logistic regression model accuracy: 0.5000


In [57]:
def pred(sentence):
  yhat = predict_tweet(sentence, freqs, theta)
  if yhat >= 0.5:
    return 'positive sentiment'
  else:
    return 'negative seniment'

In [58]:
my_tweet = 'It is so hot today but it is the perfect day for a beach party'
res = pred(my_tweet)
print(res)

positive sentiment
