## Initial Setup

In [47]:
import pandas as pd 
import snscrape.modules.twitter as sntwitter
from googletrans import Translator
from langdetect import detect
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import re
import yaml

In [48]:
# Parse the YAML settings file; later cells read the Snowflake and Power BI
# settings (SNOWFLAKE_*, POWERBI_*) from the resulting `config` mapping.
with open("config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

### Loading the NLP Model

In [49]:
# Load the pretrained sentiment classifier and its matching tokenizer.
roberta = "cardiffnlp/twitter-roberta-base-sentiment-latest"

model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)

# `Translator` is rebound from the googletrans class to an instance because
# translateSentence calls `Translator.translate(...)`. Guard the rebind so that
# re-running this cell does not try to call the already-created instance
# (which would raise "TypeError: 'Translator' object is not callable").
if isinstance(Translator, type):
    Translator = Translator()


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Scraping the Twitter Data

In [50]:
def translateSentence(tweet):
    """Return `tweet` in English, translating it when another language is detected.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        `tweet` unchanged when it is detected as English, the translated text
        otherwise, or "" when language detection raises an error.
    """
    try:
        languageDetected = detect(tweet)
    except Exception:
        # Best effort: text whose language cannot be detected is skipped by
        # returning "". Catch Exception rather than using a bare `except` so
        # KeyboardInterrupt / SystemExit can still stop a long-running scrape.
        return ""

    if languageDetected == 'en':
        return tweet

    return Translator.translate(tweet, dest="en").text

In [51]:
def preprocessTweet(tweet, min_length=10):
    """Strip Twitter-specific noise from `tweet` before sentiment scoring.

    Removes @mentions, '#' symbols (the hashtag word itself is kept), the
    retweet marker "RT" and http(s) links.

    Parameters
    ----------
    tweet : str
        Tweet text to clean.
    min_length : int, optional
        Cleaned texts with `min_length` characters or fewer are rejected
        (default 10).

    Returns
    -------
    str or None
        The cleaned text, or None when it is too short to be useful.
    """
    # Handles may contain underscores (e.g. @Asad_Umar), so include "_" in the class.
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
    tweet = re.sub(r'#', '', tweet)  # drop only the '#' symbol
    # \b keeps "RT " from being cut out of the end of ordinary words ("SMART ").
    tweet = re.sub(r'\bRT\s+', '', tweet)
    tweet = re.sub(r'https?:\/\/\S+', '', tweet)  # remove hyperlinks

    if len(tweet) > min_length:
        return tweet
    return None


In [52]:
# Sentiment class names, listed in the order getPrediction maps scores 0, 1, 2.
labels = [
    "Negative",
    "Neutral",
    "Positive",
]

In [53]:
def getPrediction(tweet):
    """Classify the sentiment of `tweet` with the loaded model.

    Parameters
    ----------
    tweet : str
        Cleaned, English tweet text.

    Returns
    -------
    tuple
        `(label, score)` where `label` is "Negative", "Neutral" or "Positive"
        and `score` is the softmax probability of that label.
    """
    # Truncate explicitly: inputs longer than the model's position limit
    # (512 tokens for RoBERTa-base) make the forward pass fail.
    encoded_tweet = tokenizer(tweet, return_tensors='pt', truncation=True, max_length=512)
    output = model(**encoded_tweet)

    logits = output[0][0].detach().tolist()
    probabilities = softmax(logits)  # logits -> probabilities

    class_names = ("Negative", "Neutral", "Positive")
    # Index of the most probable class; ties resolve to the first one.
    best = max(range(len(class_names)), key=lambda idx: probabilities[idx])
    return class_names[best], probabilities[best]

In [98]:
# Twitter handles (without the leading "@") whose timelines are scraped.
peopleToScrap = [
    "ImranKhanPTI",
    "HamidMirPAK",
    "BBhuttoZardari",
    "CMShehbaz",
    "Asad_Umar",
]

In [99]:
def scrap_twitter_data(person, max_tweets=100):
    """Scrape, translate, clean and score the latest tweets of one account.

    Parameters
    ----------
    person : str
        Twitter handle; the search query used is "from:<person>".
    max_tweets : int, optional
        Maximum number of tweets pulled from the scraper (default 100).
        Tweets rejected by translation/cleaning count towards this limit.

    Returns
    -------
    list of list
        One row per kept tweet: [date, display name, username, raw content,
        cleaned content, polarity label, polarity score].
    """
    from itertools import islice  # local import keeps this cell self-contained

    scrapper = sntwitter.TwitterSearchScraper("from:" + person)
    tweets = []
    # islice enforces the limit on every iteration. The previous `if i > 100`
    # check sat after `continue`, so it was skipped for rejected tweets and the
    # loop could run far past the intended limit.
    for tweet in islice(scrapper.get_items(), max_tweets):
        tweetContent = preprocessTweet(translateSentence(tweet.rawContent))
        if not tweetContent:
            # Undetectable language or text too short after cleaning.
            continue

        polarity, score = getPrediction(tweetContent)
        tweets.append([
            tweet.date,
            tweet.user.displayname,
            tweet.user.username,
            tweet.rawContent,
            tweetContent,
            polarity,
            score,
        ])
    return tweets

In [100]:
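### OPTIONAL: multiprocessing variant of the sequential loop in the next cell (needs `import concurrent.futures`, which is not imported above)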
# completeData = []
# with concurrent.futures.ProcessPoolExecutor() as executor: 
    
#     results = executor.map(scrap_twitter_data, peopleToScrap)
    
#     for result in results:
#         completeData.extend(result)


In [101]:
# Scrape every configured account in turn and pool all rows into one list.
completeData = []
for handle in peopleToScrap:
    completeData += scrap_twitter_data(handle)
    print("Data Scrapped Successfully against", handle)

Data Scrapped Successfully against ImranKhanPTI
Data Scrapped Successfully against HamidMirPAK
Data Scrapped Successfully against BBhuttoZardari
Data Scrapped Successfully against CMShehbaz
Data Scrapped Successfully against Asad_Umar


In [102]:
# Column names, in the same order as the fields of each row built by scrap_twitter_data.
tweet_columns = ['TWEET_DATE', 'NAME', 'USERNAME', 'CONTENT', 'NORMALIZED_TWEET', 'POLARITY', 'SCORE']
tweet_df = pd.DataFrame(completeData, columns=tweet_columns)

In [103]:
def fix_date_cols(df, tz = 'UTC'):
    """Make the TWEET_DATE column of `df` timezone-aware in `tz`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "TWEET_DATE" column holding datetimes or date strings.
        The column is replaced in place.
    tz : str, optional
        Target timezone (default 'UTC').

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for chaining.
    """
    # Parse first so the function also works once the column holds strings.
    dates = pd.to_datetime(df["TWEET_DATE"])
    if dates.dt.tz is None:
        dates = dates.dt.tz_localize(tz)
    else:
        # tz_localize raises TypeError on values that already carry a
        # timezone, so convert those instead.
        dates = dates.dt.tz_convert(tz)
    df["TWEET_DATE"] = dates
    return df

In [104]:
# Reduce the timestamps to YYYY-MM-DD text and add a 1-based row number column.
tweet_dates = pd.to_datetime(tweet_df["TWEET_DATE"])
tweet_df["TWEET_DATE"] = tweet_dates.dt.strftime('%Y-%m-%d')
tweet_df['INDEX'] = range(1, len(tweet_df) + 1)
tweet_df

Unnamed: 0,TWEET_DATE,NAME,USERNAME,CONTENT,NORMALIZED_TWEET,POLARITY,SCORE,INDEX
0,2023-02-18,Imran Khan,ImranKhanPTI,کراچی پولیس آفس پر کل کے دہشتگرد حملے کی شدید ...,I strongly condemn yesterday's terrorist attac...,Negative,0.936052,1
1,2023-02-18,Imran Khan,ImranKhanPTI,Strongly condemn the terrorist attack on Karac...,Strongly condemn the terrorist attack on Karac...,Negative,0.927952,2
2,2023-02-13,Imran Khan,ImranKhanPTI,ضیاءمحیِّ الدین کی رحلت پر افسردہ ہوں۔میری ان ...,I am saddened by the death of Zia Mahi -ud -di...,Neutral,0.444117,3
3,2023-02-13,Imran Khan,ImranKhanPTI,Saddened to learn of the passing of Zia Mohyed...,Saddened to learn of the passing of Zia Mohyed...,Positive,0.539021,4
4,2023-02-11,Imran Khan,ImranKhanPTI,قانون کی حکمرانی کےبغیرجمہوریت کا تصور ہی محال...,The concept of democracy without the rule of l...,Neutral,0.774645,5
...,...,...,...,...,...,...,...,...
494,2022-12-31,Asad Umar,Asad_Umar,ووٹر موجود، الیکشن کمیشن اور بزدل امپورٹڈ حکوم...,"Voters present, Election Commission and coward...",Negative,0.606073,495
495,2022-12-31,Asad Umar,Asad_Umar,یہ امپورٹد حکومت جو نظام بنانے کی کوشش کر رہی ...,"This imported government, which is trying to c...",Negative,0.775931,496
496,2022-12-31,Asad Umar,Asad_Umar,تحریک انصاف اسلام آباد کی ٹیمیں الیکشن کے لئے ...,PTI Islamabad teams are present on the ground ...,Negative,0.634548,497
497,2022-12-30,Asad Umar,Asad_Umar,Another 294 million $ drop in state bank reser...,Another 294 million $ drop in state bank reser...,Negative,0.891747,498


In [105]:
# Save a local CSV copy of the scored tweets (the DataFrame index is not written).
tweet_df.to_csv(path_or_buf="Scrapped_Data.csv", index=False)

## Snowflake Connectivity

In [106]:
import snowflake.connector
import yaml
from snowflake.connector.pandas_tools import write_pandas

In [107]:
# Open the Snowflake session with the settings from config.yaml.
# The connector's documented parameter names are lowercase `role` and
# `warehouse`; the capitalized `Role` / `Warehouse` spellings used previously
# do not match them.
conn = snowflake.connector.connect(
    user=config["SNOWFLAKE_USERNAME"],
    password=config["SNOWFLAKE_PASSWORD"],
    account=config["SNOWFLAKE_ACCOUNT"],
    database=config["SNOWFLAKE_DATABASE"],
    schema=config["SNOWFLAKE_SCHEMA"],
    region=config["SNOWFLAKE_REGION"],
    role=config["SNOWFLAKE_ROLE"],
    warehouse=config["SNOWFLAKE_WAREHOUSE"],
)

In [108]:
# Empty TWEETS_DATA before it is reloaded; leaving the `with` block closes the cursor.
with conn.cursor() as cs:
    cs.execute("delete from TWEETS_DATA")

In [109]:
# Bulk-load the scored tweets DataFrame into the TWEETS_DATA table.
success, nchunks, nrows, _ = write_pandas(
    conn=conn,
    df=tweet_df,
    table_name="TWEETS_DATA",
    quote_identifiers=True,
)

## Power BI Data Refresh 

In [110]:
# Video Reference: https://www.youtube.com/watch?v=kcOp3_RCMmo
import requests
import msal

def request_access_token():
    """Obtain a Power BI REST API access token using the settings in `config`.

    Reads POWERBI_APP_ID, POWERBI_TENANT_ID, POWERBI_USERNAME and
    POWERBI_PASSWORD from the module-level `config` mapping and signs in with
    username and password through MSAL.

    Returns
    -------
    str
        The access token.

    Raises
    ------
    RuntimeError
        When the token response contains no access token. RuntimeError is an
        Exception subclass, so existing `except Exception` handlers still work.
    """
    app_id = config['POWERBI_APP_ID']
    tenant_id = config['POWERBI_TENANT_ID']

    authority_url = 'https://login.microsoftonline.com/' + tenant_id
    scopes = ['https://analysis.windows.net/powerbi/api/.default']

    # Step 1. Generate Power BI Access Token
    client = msal.PublicClientApplication(app_id, authority=authority_url)
    token_response = client.acquire_token_by_username_password(
        username=config['POWERBI_USERNAME'],
        password=config['POWERBI_PASSWORD'],
        scopes=scopes,
    )
    if 'access_token' not in token_response:
        # Error fields may be missing; fall back step by step so the real
        # failure is reported instead of a KeyError.
        reason = (
            token_response.get('error_description')
            or token_response.get('error')
            or 'no access_token in the token response'
        )
        raise RuntimeError(reason)

    return token_response['access_token']

# Step 2. Ask the Power BI REST API to refresh the dataset.
access_id = request_access_token()

dataset_id = config['POWERBI_DATASET_ID']
endpoint = f'https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes'
headers = {
    'Authorization': 'Bearer ' + access_id
}

# A timeout keeps the notebook from hanging indefinitely if the API never answers.
response = requests.post(endpoint, headers=headers, timeout=30)
if response.status_code == 202:
    print('Dataset refreshed')
else:
    print(response.reason)
    try:
        print(response.json())
    except ValueError:
        # The error body is not guaranteed to be JSON; show the raw text instead of raising.
        print(response.text)


Dataset refreshed


## Power BI Connectivity

In [111]:
from IPython.display import IFrame

In [112]:
# Embed URL of the Power BI report, wrapped in a 950x600 inline frame.
link_report = "https://app.powerbi.com/reportEmbed?reportId=b0c6bf9b-3ec5-41ad-839e-83d6ac075010&autoAuth=true&ctid=a1e3cc4f-47e2-4e32-a7a1-5b14136b160b"
Dashboard = IFrame(link_report, width=950, height=600)

In [113]:
# Render the IFrame holding the Power BI report as this cell's output.
display(Dashboard)