In [1]:
import pandas as pd
import numpy as np
import re
import csv
import sys
from nltk.stem.porter import PorterStemmer
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from scipy.special import softmax
import urllib.request


In [2]:
# Load the raw tweets. The file is parsed as tab-separated text with no header
# row, so the resulting columns are labelled by integer position (0, 1, ...).
data = pd.read_csv("Task-1 tweets_1000.csv", sep="\t", header=None)

In [3]:
# Collect the values of the first column into a plain Python list.
words = list(data[0])

In [4]:
#Data Cleaning
# NOTE(review): presumably meant to toggle Porter stemming of tokens in
# preprocess_tweet -- confirm that the function actually consults this flag.
use_stemmer =True
def preprocess_word(word):
    """Normalise a single whitespace-delimited token.

    Leading and trailing punctuation is stripped, every run of a repeated
    character is squeezed down to exactly two (``funnnnny`` -> ``funny``),
    and finally all hyphens and apostrophes are removed.

    Args:
        word: The token to clean.

    Returns:
        str: The cleaned token (possibly empty).
    """
    # Punctuation is only trimmed from the two ends, never from the middle.
    trimmed = word.strip('\'"?!,.():;')
    # A run of 2+ identical characters becomes exactly two of that character.
    squeezed = re.compile(r'(.)\1+').sub(r'\1\1', trimmed)
    # Drop every remaining "-" and "'" anywhere in the token.
    return re.compile(r'(-|\')').sub('', squeezed)


def is_valid_word(word):
    """Tell whether ``word`` looks like a usable token.

    A token is accepted when it starts with an ASCII letter and the rest
    consists solely of ASCII letters, digits, dots or underscores. (Because
    the pattern ends in ``$``, one trailing newline is tolerated.)

    Returns:
        bool: True if the token matches, False otherwise.
    """
    match = re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word)
    return match is not None


def handle_emojis(tweet):
    """Replace ASCII emoticons in ``tweet`` with sentiment placeholders.

    Positive emoticons become ``' EMO_POS '`` and negative ones
    ``' EMO_NEG '`` (note the surrounding spaces). The substitutions are
    applied one after another in the order listed below, each on the result
    of the previous one.

    Args:
        tweet: Text to scan.

    Returns:
        str: The text with every matched emoticon replaced.
    """
    positive, negative = ' EMO_POS ', ' EMO_NEG '
    substitutions = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', positive),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', positive),
        # Love -- <3, :*
        (r'(<3|:\*)', positive),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?D|\(-?;)', positive),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', negative),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', negative),
    )
    for pattern, placeholder in substitutions:
        tweet = re.sub(pattern, placeholder, tweet)
    return tweet


def preprocess_tweet(tweet):
    """Clean a raw tweet and return it as a space-separated string of tokens.

    Steps, in order: lower-case the text; replace URLs with ``URL`` and
    @handles with ``USER_MENTION``; unwrap #hashtags; drop the retweet marker
    ``rt``; turn runs of two or more dots into a space; trim surrounding
    spaces and quotes; map emoticons to ``EMO_POS``/``EMO_NEG``; then
    normalise every token with ``preprocess_word`` and keep only those
    accepted by ``is_valid_word``. Kept tokens are Porter-stemmed when the
    module-level ``use_stemmer`` flag is true.

    Args:
        tweet: Raw tweet text. Any non-string value (such as ``None`` or a
            float NaN) is treated as an empty tweet instead of raising.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` when no
        token survives cleaning.
    """
    # Missing values cannot be lower-cased; treat them as "no text".
    if not isinstance(tweet, str):
        return ''

    processed_tweet = []
    # Convert to lower case
    tweet = tweet.lower()
    # Replaces URLs with the word URL
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    # Replace @handle with the word USER_MENTION
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # Replaces #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')
    # Replace emoticons with either EMO_POS or EMO_NEG.
    # NOTE(review): the text is already lower-cased at this point, so the
    # upper-case "D" alternatives in handle_emojis (":D", "xD", ";D", ...)
    # cannot match here. Left unchanged on purpose: enabling them without
    # word-boundary guards would also hit text such as "Breaking: Deaths".
    tweet = handle_emojis(tweet)
    # Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)
    words = tweet.split()

    for word in words:
        word = preprocess_word(word)
        if is_valid_word(word):
            # Honour the module-level switch instead of always stemming.
            if use_stemmer:
                word = str(porter_stemmer.stem(word))
            processed_tweet.append(word)

    return ' '.join(processed_tweet)

# Shared stemmer instance. preprocess_tweet refers to this module-level name
# and resolves it when called, so defining it after the function is fine.
porter_stemmer = PorterStemmer()

In [5]:
# Clean every raw tweet exactly once. preprocess_tweet always returns a string
# (possibly empty) and never None, so no placeholder branch is needed.
processed_tweets = [preprocess_tweet(tweet) for tweet in words]

In [6]:
#RoBERTa Model
# Checkpoint identifier handed to from_pretrained; the tokenizer and the
# sequence-classification model are both loaded from the same checkpoint.
roberta = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(roberta)
model = AutoModelForSequenceClassification.from_pretrained(roberta)

In [7]:
# Class names, looked up by position in the model's score vector.
labels = ['Negative', 'Neutral', 'Positive']
# Accumulates one predicted label per processed tweet.
sentiments = list()

In [9]:
# sentiment analysis 
def sentiment(tweet, max_length=512):
    """Classify one cleaned tweet with the loaded RoBERTa model.

    The text is tokenized, run through ``model``, and the class with the
    highest softmax score is returned. Input longer than ``max_length``
    tokens is truncated so that an over-long text cannot overflow the
    model's position embeddings. This replaces the earlier workaround that
    compared against ``processed_tweets[942]`` and returned a fixed
    'Negative'; that workaround raised IndexError on datasets with fewer
    than 943 tweets and mislabelled any text equal to that entry.

    Args:
        tweet: Pre-processed tweet text (may be an empty string).
        max_length: Maximum number of tokens fed to the model. The default
            of 512 is the usual limit for RoBERTa-base checkpoints --
            NOTE(review): confirm against the loaded model's config.

    Returns:
        str: One of the entries of the module-level ``labels`` list.
    """
    encoded_tweet = tokenizer(
        tweet,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_tweet)
    # output[0] holds the logits; [0] selects the single item in the batch.
    scores = softmax(output[0][0].detach().numpy())
    # argmax returns the first index of the maximum, matching a left-to-right
    # scan for the highest score.
    return labels[int(np.argmax(scores))]

In [10]:
# Classify every cleaned tweet. Rebinding the list (rather than appending to
# the existing one) keeps this cell idempotent: re-running it cannot leave
# duplicate entries that make `sentiments` longer than `processed_tweets`.
sentiments = [sentiment(word) for word in processed_tweets]

In [11]:
# Column name -> column values for the results table.
d = {
    'Tweets': processed_tweets,
    'Label': sentiments,
}

In [12]:
# Tabulate each cleaned tweet next to its predicted label.
df = pd.DataFrame(data=d)

In [13]:
# Preview the first ten rows of the results.
df.head(n=10)

Unnamed: 0,Tweets,Label
0,suck at social distanc,Negative
1,jasmin strang share a messag of hope dure thi ...,Neutral
2,i gotta fight these allergi in public to make ...,Negative
3,url on easter pleas rememb the poor and desol ...,Neutral
4,user_ment i have a cute one made from recycl s...,Positive
5,told my mom we should start to work from home ...,Positive
6,,Neutral
7,user_ment user_ment user_ment user_ment user_m...,Negative
8,we are all in deep doo doo,Neutral
9,to be honest everyon wa scare of coronaviru an...,Negative


In [15]:
from pathlib import Path

# Save the results to the current user's Desktop. Building the location from
# Path.home() instead of a hard-coded '/Users/jaide/...' prefix yields the
# same file for that account while letting the notebook run under any other.
filepath = Path.home() / 'Desktop' / 'RoBERTa_output.csv'
# Create the target folder if it does not exist yet.
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath)