In [None]:
# use your own kaggle.json
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/

# change the permission
!chmod 600 ~/.kaggle/kaggle.json

# download dataset
!kaggle datasets download -d krishbaisoya/tweets-sentiment-analysis

# unzip the dataset
!unzip tweets-sentiment-analysis.zip

In [None]:
"""
Use the data to train a model to predict the sentiment of a tweet, either positive or negative.
"""

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides warnings emitted by the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')
# Seed NumPy's global random number generator
np.random.seed(42)
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Load the train/test CSV files extracted from the dataset archive
training_data = pd.read_csv('train_data.csv')
testing_data = pd.read_csv('test_data.csv')

# Preview both frames. display() (provided by IPython) is used because a bare
# expression is only rendered when it is the LAST statement of a cell, and this
# cell keeps going after these checks.
display(training_data.head())
display(testing_data.head())

# Row/column counts of each split
print("Training data shape: ", training_data.shape)
print("Testing data shape: ", testing_data.shape)

# Number of missing values per column
display(training_data.isnull().sum())
display(testing_data.isnull().sum())

# Distinct label values. The label column is accessed as lowercase 'sentiment'
# everywhere else in this notebook; the previous 'Sentiment' (capital S) did not
# match those accesses.
display(training_data['sentiment'].unique())

# Draw one count plot of the sentiment labels per dataset: training first, then testing.
for frame, plot_title in (
    (training_data, 'Distribution of Sentiment Labels in Training Data'),
    (testing_data, 'Distribution of Sentiment Labels in Testing Data'),
):
    sns.countplot(x='sentiment', data=frame)
    plt.xlabel('Sentiment Label')
    plt.ylabel('Count')
    plt.title(plot_title)
    # Replace the tick labels at positions 0 and 1 with readable names
    plt.xticks([0, 1], ['Negative', 'Positive'])
    plt.show()

# First cleanup pass: lowercase every sentence and strip everything that is not
# a letter, a digit or whitespace.
#
# The character class is written A-Z (not A-z): in ASCII the range A-z also
# covers [ \ ] ^ _ and `, so those symbols used to survive the cleanup.
# A raw string is used so that \s is not treated as an (invalid) string escape.
_NON_ALNUM_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')

for frame in (training_data, testing_data):
    frame['sentence'] = frame['sentence'].apply(
        lambda text: _NON_ALNUM_PATTERN.sub('', text.lower())
    )

import nltk
# Fetch the NLTK stopword corpus used below
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string

# English stopwords as a set, so the membership test in remove_stopwords is a hash lookup.
# (The second, redundant `from nltk.corpus import stopwords` that used to sit here was dropped.)
stop_words = set(stopwords.words('english'))

# Individual ASCII punctuation characters
punctuation = list(string.punctuation)


# remove stopwords from the sentence
def remove_stopwords(sentence):
    """Return `sentence` without the whitespace-separated words found in `stop_words`.

    The remaining words are joined back together with single spaces.
    """
    return " ".join(word for word in sentence.split() if word not in stop_words)

# lowercase the sentence
def lowercase(sentence):
    """Return a lower-cased copy of `sentence`."""
    return sentence.lower()

# remove punctuation from the sentence
def remove_punctuation(sentence):
    """Return `sentence` with every character listed in `string.punctuation` deleted."""
    # Mapping a character to None in a translation table deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return sentence.translate(deletion_table)

# remove numbers from the sentence
def remove_numbers(sentence):
    """Return `sentence` with every run of digits removed (nothing is inserted in its place)."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', sentence)

# remove extra spaces from the sentence
def remove_extra_spaces(sentence):
    """Collapse each run of consecutive space characters in `sentence` into one space.

    Only the space character is affected; tabs and newlines are left as they are.
    """
    return re.compile(' +').sub(' ', sentence)

# remove stopwords from the sentence
def remove_stopwords(sentence):
    """Drop every whitespace-separated word of `sentence` that appears in `stop_words`.

    Returns the surviving words joined by single spaces.
    """
    # NOTE(review): this re-defines remove_stopwords, which is already declared
    # with the same body earlier in this cell; this later definition is the one
    # bound to the name once the cell has run.
    kept_words = []
    for token in sentence.split():
        if token in stop_words:
            continue
        kept_words.append(token)
    return " ".join(kept_words)

# tokenize the sentence
def tokenize(sentence):
    """Tokenize `sentence` with NLTK's TweetTokenizer and return the tokens joined by single spaces.

    The tokenizer is created with strip_handles=True and reduce_len=True.
    """
    tokens = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize(sentence)
    return " ".join(tokens)

# Cleaning steps, applied to every sentence in exactly this order.
PREPROCESSING_STEPS = (
    lowercase,
    remove_punctuation,
    remove_numbers,
    remove_extra_spaces,
    remove_stopwords,
    tokenize,
)

# Remember the first training sentence as it looks when entering this pipeline.
# Both print statements used to read the column AFTER it had been overwritten,
# so "before" and "after" always showed the same text.
sentence_before = training_data['sentence'][0]

# Run the full pipeline on the training frame first, then on the testing frame.
for frame in (training_data, testing_data):
    for step in PREPROCESSING_STEPS:
        frame['sentence'] = frame['sentence'].apply(step)

# Show the effect of the pipeline on one example
print("Before preprocessing: ", sentence_before)
print("After preprocessing: ", training_data['sentence'][0])


# Hold out 20% of the labelled training tweets as a validation set (seeded split)
train_x, valid_x, train_y, valid_y = train_test_split(
    training_data['sentence'],
    training_data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Turn the text into TF-IDF features (max_features=5000); the vectorizer is
# fitted on the training split and only applied to the validation split
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_x_tfidf = tfidf_vectorizer.fit_transform(train_x)
valid_x_tfidf = tfidf_vectorizer.transform(valid_x)

# Fit a logistic-regression classifier with default settings
# (fit() returns the estimator itself, so the result can be bound directly)
logistic_regression = LogisticRegression().fit(train_x_tfidf, train_y)

# Predict labels for the validation split
predictions = logistic_regression.predict(valid_x_tfidf)

# Classification report, followed by the confusion matrix
validation_report = classification_report(valid_y, predictions)
print(validation_report)

validation_confusion = confusion_matrix(valid_y, predictions)
print(validation_confusion)


# Evaluate on the separate test file, reusing the already fitted vectorizer and classifier
test_x_tfidf = tfidf_vectorizer.transform(testing_data['sentence'])
predictions = logistic_regression.predict(test_x_tfidf)

# Classification report for the test set
test_report = classification_report(testing_data['sentiment'], predictions)
print(test_report)

# Persist the fitted classifier and vectorizer. The `with` blocks guarantee each
# file handle is closed (and its contents flushed) before the files are read back.
with open('logistic_regression.pkl', 'wb') as model_file:
    pickle.dump(logistic_regression, model_file)
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

# Reload both objects from disk.
# SECURITY: pickle.load can execute arbitrary code; only load pickle files you
# created yourself or otherwise trust.
with open('logistic_regression.pkl', 'rb') as model_file:
    logistic_regression = pickle.load(model_file)
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

# predict the sentiment from the user input
def predict_sentiment(user_input):
    """Clean `user_input`, vectorize it and return the classifier's prediction.

    The text goes through lowercase, remove_punctuation, remove_numbers,
    remove_extra_spaces, remove_stopwords and tokenize (in that order), is
    transformed with `tfidf_vectorizer` as a one-element batch, and the result
    of `logistic_regression.predict` for that batch is returned.
    """
    cleaning_steps = (
        lowercase,
        remove_punctuation,
        remove_numbers,
        remove_extra_spaces,
        remove_stopwords,
        tokenize,
    )
    cleaned_text = user_input
    for step in cleaning_steps:
        cleaned_text = step(cleaned_text)

    features = tfidf_vectorizer.transform([cleaned_text])
    return logistic_regression.predict(features)

# Ask for a sentence on standard input and classify it
user_input = input("Enter a sentence: ")
prediction = predict_sentiment(user_input)

# A prediction of 0 is reported as negative, anything else as positive
print("Negative" if prediction == 0 else "Positive")


In [None]:
# Next, collect tweets from Twitter with tweepy so that their sentiment can be predicted with the model trained above.

# import the libraries
#import dependencies
import tweepy
from tweepy import OAuthHandler
import json
from unidecode import unidecode
import time
import datetime
from tqdm import tqdm 
from decouple import config

# Authenticate to Twitter
import tweepy


# Load the Twitter credentials through decouple's config(), in a fixed order.
# bearer_token is loaded but not referenced by the handler set up below.
(
    api_key,
    api_key_secret,
    bearer_token,
    access_token,
    access_token_secret,
) = (
    config(setting_name)
    for setting_name in (
        'api_key',
        'api_key_secret',
        'bearer_token',
        'access_token',
        'access_token_secret',
    )
)

# Build the OAuth handler from the API key pair, then attach the access token pair
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)

# API client used by the search helper defined below
api = tweepy.API(auth)

def tweetSearch(query, limit):
    """
    Search Twitter for `query` and return a list of the matching tweets.

    Parameters
    ----------
    query : search string, passed to the search endpoint as `q`.
    limit : maximum number of tweets to fetch, passed to `Cursor.items`.

    Returns
    -------
    list of the tweet objects yielded by the cursor. Tweets are requested with
    tweet_mode='extended'.
    """
    # tweepy 4.0 renamed API.search to API.search_tweets. Prefer the new name
    # and fall back to the old one so the function works with either version.
    search_method = getattr(api, 'search_tweets', None) or api.search

    # Page through the results until `limit` tweets have been collected
    cursor = tweepy.Cursor(search_method, q=query, tweet_mode='extended')
    return list(cursor.items(limit))


def tweets_to_data_frame(tweets):
    """
    Collect selected attributes of each tweet into a pandas data frame.

    Parameters
    ----------
    tweets : iterable of tweet objects exposing full_text, id, created_at,
        place, coordinates, lang, source, favorite_count and retweet_count.

    Returns
    -------
    pandas.DataFrame with one row per tweet and the columns
    Tweets, id, lens, date, place, coordinateS, lang, source, likes, retweets.
    An empty input yields an empty frame with the same columns.
    """
    # Materialize once so the input can be any iterable and is walked a single time
    tweets = list(tweets)

    # The text is stored as str. It used to be stored as full_text.encode('utf-8'),
    # i.e. bytes, which end up as literal "b'...'" text when the frame is
    # written to CSV.
    columns = {
        "Tweets": [tweet.full_text for tweet in tweets],
        "id": [tweet.id for tweet in tweets],
        "lens": [len(tweet.full_text) for tweet in tweets],
        "date": [tweet.created_at for tweet in tweets],
        "place": [tweet.place for tweet in tweets],
        # Column name kept exactly as before (capital S) so existing readers of the CSV still work
        "coordinateS": [tweet.coordinates for tweet in tweets],
        "lang": [tweet.lang for tweet in tweets],
        "source": [tweet.source for tweet in tweets],
        "likes": [tweet.favorite_count for tweet in tweets],
        "retweets": [tweet.retweet_count for tweet in tweets],
    }

    return pd.DataFrame(columns)

# Hashtags to search for; append more entries to this list as needed
hashtags = ['#kilimall']

In [None]:
import os

# Running count of tweets fetched across all hashtags
total_tweets = 0

# Make sure the output directory exists before writing into it: to_csv does not
# create missing parent directories.
os.makedirs("data", exist_ok=True)

# For every hashtag in the list, fetch the matching tweets and save them to a
# CSV file named after the hashtag.
for n in tqdm(hashtags):
    # fetch up to 100 tweets that contain the hashtag
    tweets = tweetSearch(n, 100)
    total_tweets += len(tweets)
    # convert the tweets into a pandas data frame
    df = tweets_to_data_frame(tweets)
    # write the data frame to data/<hashtag>.csv without the index column
    df.to_csv("data/{}.csv".format(n), index=False)