In [1]:
import pandas as pd
import requests
import json
import os
import string
import nltk
import re

from sklearn.feature_extraction.text import CountVectorizer

from dotenv import dotenv_values

from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\35383\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import warnings
warnings.filterwarnings('ignore') # We can suppress the warnings

In [3]:
# Load the key/value pairs stored in the local .env file.
config = dotenv_values(".env")

# Pull both API credentials out of the loaded settings in one step.
my_api_key, my_api_key_secret = (
    config[key] for key in ("API_KEY", "API_KEY_SECRET")
)

In [4]:
# Endpoint queried by Get_tweets() for every search phrase.
search_url = "https://api.twitter.com/2/tweets/search/recent"

# Token that bearer_oauth() places in the Authorization header.
bearer_token = config["BEARER_TOKEN"]


def bearer_oauth(r):
    """
    Auth hook for ``requests``: stamp the outgoing request with the bearer token.

    Sets the ``Authorization`` and ``User-Agent`` headers on ``r`` (using the
    module-level ``bearer_token``) and returns the same request object.
    """
    auth_headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2RecentSearchPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r

def connect_to_endpoint(url, params, timeout=30):
    """
    Send an authenticated GET request and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query-string parameters forwarded to ``requests.get``.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout the call could wait on an unresponsive server
        indefinitely.

    Returns
    -------
    The parsed JSON payload of the response.

    Raises
    ------
    Exception
        If the HTTP status code is not 200; the exception carries the status
        code and the response text.
    """
    response = requests.get(url, auth=bearer_oauth, params=params, timeout=timeout)
    # Echo the status code so each API call is visible in the cell output.
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


def Get_tweets(filename, params):
    """
    Return the tweets matching a list of search phrases, caching them on disk.

    If ``<filename>.bz2`` already exists it is read and returned without
    contacting the API. Otherwise every phrase in ``params`` is sent to the
    search endpoint, the results are stacked into one frame, and that frame is
    written once to ``<filename>.bz2`` as a bz2-compressed CSV.

    Parameters
    ----------
    filename : str
        Cache path without extension; ``.bz2`` is appended.
    params : sequence of str
        Search phrases. Each is combined with ``-is:retweet lang:en``.

    Returns
    -------
    pandas.DataFrame
        The collected tweets with a fresh 0..n-1 index. An empty frame is
        returned (and nothing is cached) when no phrase produced any tweet.
    """
    filename = f'{filename}.bz2'

    if os.path.exists(filename):
        df = pd.read_csv(filename, compression='bz2')
        print(f"The file {filename} was read.")
        return df

    frames = []
    for p in params:
        query_params = {
            'query' : f'{p} -is:retweet lang:en',
            'tweet.fields': 'author_id',
            'user.fields': 'name',
            "max_results":"100",
        }

        json_response = connect_to_endpoint(search_url, query_params)

        # A response with no matching tweets may carry no 'data' key at all,
        # so fall back to an empty list instead of raising KeyError.
        tweets = json_response.get('data', [])
        if tweets:
            frames.append(pd.DataFrame.from_dict(tweets))

    if not frames:
        # Nothing to cache: writing an empty CSV would only produce a file
        # that cannot be read back on the next run.
        print("No tweets were returned; nothing was saved.")
        return pd.DataFrame()

    # Concatenate once and write once, rather than re-concatenating and
    # rewriting the archive after every query.
    df = pd.concat(frames, ignore_index=True)
    df.to_csv(filename, index=False, compression='bz2')
    print(f"Data from API save as {filename}")

    return df

## Getting Tweets

In [5]:
# Search phrases sent to the API, one request per phrase.
p_list = [
    'Agriculture Ireland',
    'Agriculture France',
    'Agriculture Europe',
    'sheep Ireland',
    'goats Ireland',
    'sheep France',
    'goats France',
    'sheep Europe',
    'goats Europe',
]

# Reads tweets_total.bz2 when it exists; otherwise queries the API and creates it.
tweets_total = Get_tweets('tweets_total', p_list)

tweets_total.shape

The file tweets_total.bz2 was read.


(463, 4)

In [6]:
# List the fields available for each collected tweet.
tweets_total.columns

Index(['text', 'id', 'edit_history_tweet_ids', 'author_id'], dtype='object')

In [7]:
def Clean_word(df_column):
    """
    Normalise raw tweet text into space-separated word stems.

    For each entry: every character that is not an ASCII letter is replaced
    by a space, the text is lower-cased and split on whitespace, English stop
    words are dropped, and each remaining token is reduced to its Porter stem.

    Parameters
    ----------
    df_column : pandas.Series or iterable of str
        Raw texts. Entries that are not strings (e.g. NaN for a missing
        value) are treated as empty text instead of raising TypeError.

    Returns
    -------
    list of str
        One cleaned string per input entry, in input order.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once per token.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    non_letters = re.compile('[^a-zA-Z]')

    cleaned_data = []
    for text in df_column:
        if not isinstance(text, str):
            text = ''
        # After the substitution only letters remain, so no separate
        # punctuation filter is needed on the tokens.
        tokens = non_letters.sub(' ', text).lower().split()
        stems = [stemmer.stem(word) for word in tokens if word not in stop_words]
        cleaned_data.append(' '.join(stems))

    return cleaned_data

In [8]:
# Add the cleaned, stemmed version of every tweet next to its raw text.
tweets_total['Text_clean'] = Clean_word(tweets_total['text'])
tweets_total.head()

Unnamed: 0,text,id,edit_history_tweet_ids,author_id,Text_clean
0,Get to grips with Multifunctional Agriculture ...,1612443217391149056,['1612443217391149056'],1544699978920124417,get grip multifunct agricultur read paper link...
1,Articles from the @irishexaminer on the fishin...,1612439553406640129,['1612439553406640129'],3791827456,articl irishexamin fish commun ireland depress...
2,Domestic Milk Intake up 7.4% in November 2022\...,1612404068755951620,['1612404068755951620'],63576239,domest milk intak novemb http co djc rvhm csoi...
3,@ed_sligo European Commission approves the CAP...,1612397532981641217,['1612397532981641217'],2461039562,ed sligo european commiss approv cap strateg p...
4,@johnredwood Northern Ireland Trawler Fleet st...,1612392384095993858,['1612392384095993858'],952852258865152000,johnredwood northern ireland trawler fleet sti...


### Import the airline base dataset

In [9]:
# Load the airline tweets; their airline_sentiment column supplies the labels used for training later on.

t_air = pd.read_csv('tweets.csv')
t_air.head(3)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)


In [10]:
# Keep only the raw text and its sentiment label. The explicit copy makes
# t_air an independent frame, so adding a column below does not assign into a
# slice of the loaded data (the case pandas flags with SettingWithCopyWarning).
t_air = t_air[['text', 'airline_sentiment']].copy()
t_air['Text_clean'] = Clean_word(t_air['text'])
t_air.head(4)

Unnamed: 0,text,airline_sentiment,Text_clean
0,@VirginAmerica What @dhepburn said.,neutral,virginamerica dhepburn said
1,@VirginAmerica plus you've added commercials t...,positive,virginamerica plu ad commerci experi tacki
2,@VirginAmerica I didn't today... Must mean I n...,neutral,virginamerica today must mean need take anoth ...
3,@VirginAmerica it's really aggressive to blast...,negative,virginamerica realli aggress blast obnoxi ente...


In [11]:
# Row count of the airline data; these rows are stacked first when ml is built.
t_air.shape

(14640, 3)

### Creating a single combined dataset

In [12]:
# Show one randomly chosen row; no seed is passed, so the row differs between runs.
tweets_total.sample()

Unnamed: 0,text,id,edit_history_tweet_ids,author_id,Text_clean
10,@rayofoghlu I love these short videos from all...,1611777802444873729,['1611777802444873729'],1070801212272971782,rayofoghlu love short video around ireland loo...


In [13]:
# Stack the airline rows on top of the collected tweets under a fresh 0..n-1
# index. Cells a source does not provide (airline_sentiment for the collected
# tweets) are filled with 0.
ml = (
    pd.concat([t_air, tweets_total[['Text_clean','text']]], ignore_index = True)
    .fillna(0)
)
ml.sample()

Unnamed: 0,text,airline_sentiment,Text_clean
4821,@SouthwestAir may I have my Companion pass ple...,neutral,southwestair may companion pass pleas


Rows 0–14639 (the first 14,640 rows) come from the airline dataset.

Rows 14640–15102 (the last 463 rows) come from the Twitter search.

In [14]:
# Build a CountVectorizer capped at 5000 vocabulary terms that ignores the
# three listed tokens.
cv = CountVectorizer(stop_words = ['virginamerica', 'unit', 'amp'], max_features = 5000)

# Learn the vocabulary from the airline and collected tweets together, then
# turn the resulting sparse count matrix into a dense array.
text_counts = cv.fit_transform(ml['Text_clean'])
X = text_counts.toarray()
X.shape

(15103, 5000)

In [15]:
# Split the feature matrix back into its two sources. The boundary comes from
# t_air itself (its rows were stacked first when ml was built) rather than a
# hard-coded row count, so it stays correct if the airline file changes.
n_air = len(t_air)

X_air = X[:n_air]
X_twi = X[n_air:]
print(X_air.shape)
print(X_twi.shape)

(14640, 5000)
(463, 5000)


In [16]:
# Labels for every row of ml; rows from the collected tweets hold the 0 placeholder written by fillna(0).
y = ml['airline_sentiment']
y.shape

(15103,)

In [17]:
# Split the labels at the same boundary as the features. The boundary comes
# from t_air itself (its rows were stacked first when ml was built) rather
# than a hard-coded row count.
n_air = len(t_air)

y_air = y[:n_air]
y_twi = y[n_air:]
print(y_air.shape)
print(y_twi.shape)

(14640,)
(463,)


### Machine Learning

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Split the labelled airline data into training (70%) and testing (30%) parts.
# A fixed random_state makes the shuffle, and therefore every reported score,
# repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_air, y_air, test_size = 0.3, random_state = 0
)

In [20]:
%%time

print('\nMultinomialNB:')
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
y_pred = mnb.predict(X_test)
print(classification_report(y_test, y_pred))

print('\nLinear Support Vector Classifier:')
lsvc = LinearSVC()
lsvc.fit(X_train,y_train)
y_pred = lsvc.predict(X_test)
print(classification_report(y_test, y_pred))

print('\nGaussuan Naive Bayes:')
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred = gnb.predict(X_test)
print(classification_report(y_test, y_pred))

print('\nLogistic Regression:')
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))


print('\nRandom Forest Classifier:')
rfc = RandomForestClassifier(n_estimators = 10, random_state = 0)
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
print(classification_report(y_test, y_pred))



MultinomialNB:
              precision    recall  f1-score   support

    negative       0.80      0.90      0.85      2716
     neutral       0.60      0.45      0.51       964
    positive       0.73      0.62      0.67       712

    accuracy                           0.76      4392
   macro avg       0.71      0.65      0.67      4392
weighted avg       0.74      0.76      0.74      4392


Linear Support Vector Classifier:
              precision    recall  f1-score   support

    negative       0.83      0.85      0.84      2716
     neutral       0.58      0.55      0.56       964
    positive       0.68      0.67      0.68       712

    accuracy                           0.76      4392
   macro avg       0.70      0.69      0.69      4392
weighted avg       0.75      0.76      0.75      4392


Gaussuan Naive Bayes:
              precision    recall  f1-score   support

    negative       0.80      0.43      0.56      2716
     neutral       0.31      0.31      0.31       964
 

In [21]:

# Label the collected tweets with the fitted MultinomialNB and Logistic
# Regression models, one new column per model.
for column, fitted_model in (('MNB', mnb), ('LR', lr)):
    tweets_total[column] = fitted_model.predict(X_twi)

tweets_total.sample(10)

Unnamed: 0,text,id,edit_history_tweet_ids,author_id,Text_clean,MNB,LR
172,@Noahpinion Way more forests and wildlife in f...,1611376649572089863,['1611376649572089863'],1046907763337584640,noahpinion way forest wildlif first wave devel...,neutral,positive
189,@consoleghana @RAahiagbah @MBawumia @NPP_GH \n...,1611154672202440704,['1611154672202440704'],1131380694,consoleghana raahiagbah mbawumia npp gh prez n...,neutral,neutral
11,👇The first constructive discussion I've heard ...,1611769649049505794,['1611769649049505794'],3029057830,first construct discuss heard ireland agricult...,neutral,positive
328,@nameshiv the one I loved the most was Dominic...,1611541228218355712,['1611541228218355712'],829410324365328384,nameshiv one love dominican republ music franc...,positive,positive
112,TODAY: The Inhumane Society #AnimalWelfare #Fa...,1610329189818982401,['1610329189818982401'],157075236,today inhuman societi animalwelfar factoryfarm...,neutral,neutral
230,i consider antifa in ireland TRAITERS to their...,1611899119102038016,['1611899119102038016'],1608531910644436996,consid antifa ireland traiter countri mindless...,neutral,negative
25,@BlaneyCarola @coilltenews @pippa_hackett @Eam...,1611378324483510272,['1611378324483510272'],843923118213206017,blaneycarola coilltenew pippa hackett eamonrya...,neutral,negative
251,This is why Ireland is so beautiful on so many...,1611055627496734724,['1611055627496734724'],1350482494190211073,ireland beauti mani level expect tractor sheep...,neutral,neutral
270,@bracken_bill @farmersjournal To have guardian...,1610272623505858563,['1610272623505858563'],4850453439,bracken bill farmersjourn guardian dog flock c...,positive,neutral
300,@93vintagejones How many sheep does France have,1611407491123789833,['1611407491123789833'],1221520071987736578,vintagejon mani sheep franc,neutral,neutral


In [22]:
# Positional index of the tweet to inspect.
i = 31

# Print the full tweet text followed by the label each model assigned to it.
print(f"Text: {tweets_total['text'].iloc[i]}")
print(f'\nMNB: {tweets_total["MNB"].iloc[i]}')
print(f'LR: {tweets_total["LR"].iloc[i]}')

Text: @Sav70 @McConalogue @martinheydonfg @IrlEmbChina @Bordbia @agriculture_ie @farmersjournal @AgrilandIreland @thatsfarming You have only referenced agri imports &amp; exports with China, you don't seem to have an issue with other goods going to &amp; from there &amp; their emissions. I doubt the phone you are using is Irish made. Growing apples in Ireland doesn't pay, that's why. Everyone wants cheap food.

MNB: positive
LR: negative
