# Twitter Basic Sentiment Analysis

In [1]:
import json
import re
import pandas as pd
from google_trans_new import google_translator  # pip install google_trans_new

In this project I perform sentiment analysis on a list of tweets using only json, pandas, regular expressions and Google Translate.

We have two files. The first, 'tweets.txt', contains the raw tweet data from Twitter in JSON format. The second, 'Sentimientos.txt', contains a list of words and their associated sentiment values: some words have a positive value and others a negative one. Summing these values over the words of a tweet gives an indication of the tweet's overall sentiment.

The goal is to be able to check the sentiment value of every tweet.

### Data Preprocessing

In [2]:
# Populate the `values` dictionary (term -> integer sentiment score) from Sentimientos.txt.
# Each non-blank line of the file is expected to hold a term and its score separated by a tab.

values = {}
with open("Sentimientos.txt") as sentiments:  # `with` guarantees the file is closed after reading
    for line in sentiments:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on the tuple unpacking below
        term, value = line.split("\t")
        values[term] = int(value)  # int() ignores the trailing newline left on `value`

In [3]:
# Build the `data` list by decoding every line of tweets.txt as one JSON document

with open('tweets.txt') as f:
    data = [json.loads(line) for line in f]

In [4]:
# Collect the text and id of every record that has a 'text' key into two parallel lists,
# `tweets` and `ids`; records without a 'text' key are skipped

tweets, ids = [], []
for record in data:
    if 'text' not in record:
        continue
    tweets.append(record['text'])
    ids.append(record['id'])

### Tweets Translation

In [5]:
def google_translate(string):
    '''Translate the given string to English and return the translator's result.

    A new google_translator instance is created on every call and asked to
    translate `string` with English ('en') as the target language.
    '''
    return google_translator().translate(string, lang_tgt = 'en')

# Translate every tweet to English; `translations` is kept in the same order as `tweets`

translations = []

for tweet in tweets:
    translated = google_translate(tweet)
    translations.append(str(translated))  # str() so every entry is a plain string

### Assigning sentiment values to tweets

In [6]:
# Score every translation: lower-case it, split it into words and add up the value of each word.
# Words that are not keys of `values` contribute 0.

sentiment = []
for translation in translations:
    # Splitting on every run of characters other than a-z strips punctuation, digits and
    # whitespace; as a consequence only keys made purely of the letters a-z can ever match.
    words = re.split('[^a-z]+', translation.lower())
    sentiment.append(sum(values.get(word, 0) for word in words))

### Data Display

In [7]:
# Building the data frame:

# Display options: no limit on the number of rows shown, at most 2 columns shown
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 2)

# Column name -> list of values; the four lists are parallel (one entry per tweet)
d = {
    'Id': ids,
    'Tweet': tweets,
    'Translation': translations,
    'Sentiment': sentiment,
}

# Create the DataFrame and use the 'Id' column as its index
df = pd.DataFrame(d).set_index('Id')

df

Unnamed: 0_level_0,Tweet,...,Sentiment
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
509748524897292288,@Brenamae_ I WHALE SLAP YOUR FIN AND TELL YOU ...,...,0
509748529070616576,Metin Şentürk Twitterda @metinsenturk MUHTEŞEM...,...,0
509748529095774208,RT @byunghns: 😭 I LOVE #틴탑 SO MUCH #쉽지않아 IS GO...,...,6
509748529104175104,que hdp maicon lo que le hizo a david luiz jaj...,...,-4
509748529107988480,ドライ！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！！,...,0
509748529074814976,RT @Positivamos: tudo o que eu quero é um amor...,...,6
509748529074827266,RT @GossipRoomOff: Si Nathalie sait pertinemme...,...,1
509748529083211776,RT @Dayannalozano_: Junior va perdiendo... htt...,...,-3
509748529074823170,RT @Positivamos: tudo o que eu quero é um amor...,...,6
509748529087397889,RT @Positivamos: tudo o que eu quero é um amor...,...,6


In [8]:
def tweet_information(index):
    '''Print the tweet stored under the given id together with its sentiment value.

    If the detected language of the tweet is not English, the English
    translation and the name of the source language are printed as well.
    All the information is read from the data frame `df`; nothing is
    returned (the function always returns None).
    '''
    tweet_text = df.Tweet.loc[index]
    sentiment_value = df.Sentiment.loc[index]

    # detect() gives back a pair such as ('ja', 'japanese'); only the second item,
    # the language name, is used here
    detect_result = google_translator().detect(tweet_text)
    language = detect_result[1]

    if language == 'english':
        # English tweets need no translation section
        print("Tweet: '{}' \n\nSentiment value: {}".format(tweet_text, sentiment_value))
        return None

    message = "Tweet: '{}' \n\nSentiment value: {}.\n\nTranslation: '{}'\n\nTranslated from {}".format(
        tweet_text,
        sentiment_value,
        df.Translation.loc[index],
        language.title(),  # capitalized language name, e.g. 'Japanese'
    )
    print(message)
    return None

In [9]:
# Checking tweets: print the stored information for a single tweet, looked up by its id
# (the id must be present in the index of the data frame `df`)

tweet_information(509748529074417666)

Tweet: 'ドラコ・マルフォイ　「闇の帝王が支配なさる時、OWLやNEWTが何科目なんて『あの人』が気になさるか？もちろん、そんなことは問題じゃない・・・『あの人』のためにどのように奉仕し、どのような献身ぶりを示して来たかだけが重要だ」' 

Sentiment value: 8.

Translation: 'Draco Malfoy "When the Dark Emperor rules, how many subjects do OWL and NEWT care about" that person "? Of course, that doesn't matter ... how for" that person "? Only what you have served and how devoted you have been is important. " '

Translated from Japanese


In [10]:
# Print the stored information for a second tweet, looked up by its id
tweet_information(509748529104191488)

Tweet: 'الله أكبر الله أكبر الله أكبر, لا إله إلا الله, الله أكبر الله اكبر ولله الحمد http://t.co/M454OhQpYY' 

Sentiment value: 25.

Translation: 'God is great God is great God is great, there is no god but God, God is great God is great, praise be to God http://t.co/M454OhQpYY '

Translated from Arabic


In [11]:
# Print the stored information for a third tweet, looked up by its id
tweet_information(509748537471827968)

Tweet: 'RT @maamsalcatraz: Cette gomme la plus grosse arnaque du siècle elle était sensé gommé ls stylo ,mais elle déchirer la feuille cet pute htt…' 

Sentiment value: -7.

Translation: 'RT @maamsalcatraz: This eraser the biggest scam of the century she was supposed to erase the pen, but she tear the sheet this bitch htt… '

Translated from French
