## Loading model and predicting


In [2]:
!pip install transformers --quiet

In [3]:
!pip install python-dotenv



In [4]:
!pip install textblob



In [31]:
# loading all requirements
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import pandas as pd
import time
from transformers import (AutoTokenizer, AutoModel, 
                          AutoModelForSequenceClassification, 
                          DataCollatorWithPadding, AdamW, get_scheduler,
                          get_linear_schedule_with_warmup,
                          )
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import random
import numpy as np
import re
import tweepy
import pandas as pd
import numpy as np
import string 
from dotenv import dotenv_values
from flask import Flask, config, render_template, request,redirect,url_for,send_file
import io
import base64
from matplotlib.figure import Figure
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from io import StringIO
from pathlib import Path

In [6]:
# Setting up seed value
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
#init tokenizer
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
# load model
def load_model():
    checkpoint = "distilbert-base-uncased"
    PATH = "toxic_distilBERT_multilabel.pt"
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 6)
    model.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))
    return model

In [10]:
#read tweets csv file. It as 2 columns -> tweets and cleaned_tweets
def tokenize_csv(tweets):
    #tweets = pd.read_csv(tweets_df)
    # tokenize and encode sequences in the actual test set
    sub_tokens = tokenizer.batch_encode_plus(tweets["cleaned_tweets"].tolist(),
                                         max_length = 200,
                                         pad_to_max_length=True,
                                         truncation=True,
                                         return_token_type_ids=False
                                         )
    sub_seq = torch.tensor(sub_tokens['input_ids'])
    sub_mask = torch.tensor(sub_tokens['attention_mask'])
    sub_data = TensorDataset(sub_seq, sub_mask)
    batch_size = 32
    sub_dataloader = DataLoader(sub_data,batch_size=batch_size)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    return sub_dataloader

In [11]:
#predict 
# Measure how long the evaluation going to takes.
categories = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
def predict_final(sub_dataloader,model):
    t0 = time.time()
    for step, batch in enumerate(sub_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            pass
            # Calculate elapsed time in minutes
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        
        with torch.no_grad():
            outputs = model(b_input_ids, b_input_mask)
            pred_probs = torch.sigmoid(outputs.logits)
            if step == 0:
                predictions = pred_probs.cpu().detach().numpy()
            else:
                predictions = np.append(predictions, pred_probs.cpu().detach().numpy(), axis=0)
    
    predictions_df = pd.DataFrame(predictions, columns = categories)
    return predictions_df

In [12]:
def calculations(predictions_df):
    
    # creating new columns for different cateogries. it will have % values
    for name in categories:
        predictions_df[name+'_calc'] = predictions_df[name] * 100
    
    threshold = 4
    
    toxics = predictions_df[predictions_df['toxic_calc']>threshold]['toxic_calc'].count() 
    severe_toxic = predictions_df[predictions_df['severe_toxic_calc']>threshold]['severe_toxic_calc'].count()
    obscenes = predictions_df[predictions_df['obscene_calc']>threshold]['obscene_calc'].count()
    threats = predictions_df[predictions_df['threat_calc']>threshold]['threat_calc'].count()
    insults = predictions_df[predictions_df['insult_calc']>threshold]['insult_calc'].count()
    identity_hates = predictions_df[predictions_df['identity_hate_calc']>threshold]['identity_hate_calc'].count()
    
    identified_num = [toxics]
    identified_num.append(severe_toxic)
    identified_num.append(obscenes)
    identified_num.append(threats)
    identified_num.append(insults)
    identified_num.append(identity_hates)
    
    
    #print(toxics)
    #print(predictions_df['toxic'].count())
    toxics_perc = toxics/predictions_df['toxic'].count() * 100
    severe_toxic_perc = severe_toxic/predictions_df['severe_toxic'].count() * 100
    obscene_perc = obscenes/predictions_df['obscene'].count() * 100
    threat_perc = threats/predictions_df['threat'].count() * 100
    insult_perc = insults/predictions_df['insult'].count() * 100
    identity_hate_perc = identity_hates/predictions_df['identity_hate'].count() * 100
    
    identified_perc = [toxics_perc]
    identified_perc.append(severe_toxic_perc)
    identified_perc.append(obscene_perc)
    identified_perc.append(threat_perc)
    identified_perc.append(insult_perc)
    identified_perc.append(identity_hate_perc)
    
    return predictions_df['toxic'].count(), identified_num, identified_perc

##  grab tweets using tweepy

In [13]:
def auth(config):
    auth = tweepy.OAuthHandler(config['CONSUMER_KEY'],config['CONSUMER_SECRET'])
    auth.set_access_token(config['ACCESS_KEY'],config['ACCESS_SECRET'])
    auth.secret = True
    global api
    api = tweepy.API(auth,wait_on_rate_limit=True)
    return api


In [14]:
def clean(text):
        text  = "".join([char for char in text if char not in string.punctuation])
        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"www.\S+", "", text)
        text = re.sub(r'RT[\s]+','',text)
        text = re.sub('[0-9]+', '', text)
        return text

In [15]:
def read_tweets(df):
        df['cleaned_tweets'] = df['tweets'].apply(lambda x: clean(str(x)))
        return df

In [46]:

def grab_tweets(user):
    tweetsPerQry = 100
    fName = 'static/files/'+user+'.txt'
    sinceId = None
    max_id = -1
    maxTweets = 1000
    tweetCount = 0
    output = []
    print("Downloading max {0} tweets",format(maxTweets))
    with open(fName,'w') as f:
        while tweetCount < maxTweets:
            try:
                if(max_id<=0):
                    if(not sinceId):
                        new_tweets = api.user_timeline(screen_name=user,lang='en',count=tweetsPerQry, tweet_mode ='extended')
                    else:
                        new_tweets = api.user_timeline(screen_name=user,lang='en',count=tweetsPerQry,since_id=sinceId, tweet_mode ='extended')
                else:
                    if(not sinceId):
                        new_tweets = api.user_timeline(screen_name=user,lang='en',count=tweetsPerQry,max_id = str(max_id-1), tweet_mode ='extended')
                    else:
                        new_tweets = api.user_timeline(screen_name=user,lang='en',count=tweetsPerQry,max_id = str(max_id-1),since_id=sinceId, tweet_mode ='extended')
    
                if not new_tweets:
                    print("No more tweets found")
                    break
                for tweet in new_tweets:
                    output.append(tweet.full_text.replace('\n','').encode("utf-8"))
                    f.write(str(tweet.full_text.replace('\n','').encode("utf-8"))+"\n")

                tweetCount += len(new_tweets)
                #print("Dowloaded {0} tweets".format(tweetCount))
                max_id=new_tweets[-1].id
            except tweepy.TweepError as e:
                print('some error: '+str(e))
                break
    df = pd.DataFrame(output,columns=['tweets'])  
    return read_tweets(df)

In [None]:
config = dotenv_values('.env')
auth(config)
tweet_df = grab_tweets(userName)

## flask server

In [None]:
'''
Flow of program
Run every cell first -> init torch -> load_model -> auth tweepy -> grab_tweets -> read_tweets -> clean_tweets 
tokenize_csv -> predict_final -> calculations 

Flask server should be running in this only 
Server start -> render home page -> get userName -> pass it to grab_tweets ->{next steps as above} -> pass back 
calcuations output to chart on front page

'''

In [18]:
# loading model. Tokenizer already fired up
model = load_model()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [44]:
def polarity():
    polarity=lambda x:TextBlob(x).sentiment.polarity
    subjectivity = lambda x:TextBlob(x).sentiment.subjectivity

    tweet_polarity = np.zeros(len(tweet_df['cleaned_tweets']))
    tweet_subjectivity = np.zeros(len(tweet_df['cleaned_tweets']))

    for idx, tweet in enumerate(tweet_df['cleaned_tweets']):
        tweet_polarity[idx] = polarity(tweet)
        tweet_subjectivity[idx] = subjectivity(tweet)
    
    
    plt.figure(figsize=(12,6))
    sns.scatterplot(tweet_polarity, #x-axis
                tweet_subjectivity, #y-axis
                s=100)
    
    plt.title('sentimental Analysis', fontsize=20)
    plt.xlabel('<-Negative- - - - - - - - - - - - - - - - - - - - - - - - - - - -Positive->',fontsize=15)
    plt.ylabel('<-Facts- - - - - - - - - - - - - - - - -opinion->',fontsize=15)
    plt.tight_layout()
    plt.plot()
    
    #save_file
    url1 = 'static/images/'+name+'scatter.png'
    my_file = Path(url1)
    if my_file.exists():
        pass
    else:
        plt.savefig(url1)
    plt.close()
    # second plot
    f, axs = plt.subplots(1, 2, figsize=(20,8))

    sns.distplot(tweet_polarity, color="b", ax=axs[0])
    axs[0].set_title("Tweet Polarity", fontsize = 20)
    axs[0].set_xlabel('← Negative - - - - - - - - - - - - - - - - - - Positive →', fontsize=15)
    sns.distplot(tweet_subjectivity, color="b", ax=axs[1])
    axs[1].set_title("Tweet Subjectivity", fontsize = 20)
    axs[1].set_xlabel('← Facts - - - -  - - - - - - - - - - - - - - Opinions →', fontsize=15)
    plt.tight_layout()
    plt.plot()
    url2 = 'static/images/'+name+'dist.png'
    my_file2 = Path(url2)
    if my_file2.exists():
        pass
    else:
        plt.savefig(url2)
    plt.close()
    
    return url1,url2 

In [45]:
app = Flask(__name__)


@app.route('/',methods=['POST','GET'])
def home():
    if request.method=='POST':
        global name
        name = request.form['userName'].strip()
        global tweet_df
        tweet_df = grab_tweets(name) # will return a dataframe which has 2 columns named tweets and cleaned_tweets
        #Now we have to tokenize it and predict
        sub_dataloader = tokenize_csv(tweet_df)
        predictions_df= predict_final(sub_dataloader,model)
        length,identified_num, identified_perc = calculations(predictions_df)
        
        g1,g2 = polarity()
        fName = 'static/images/'+name+'.txt'
        print(g1,g2)
        categories = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
        return render_template('predict.html',userName=name,
                               nums = identified_num, perc=identified_perc, 
                               length =length, labels = categories, 
                               url1=g1, url2 = g2,fName = fName)
    
    else:
        return render_template('home.html')
        
@app.route('/predict')
def predict():
    return render_template('predict.html')




if __name__ == '__main__':
    config = dotenv_values('.env')
    auth(config)
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [15/Oct/2021 17:03:36] "[37mGET / HTTP/1.1[0m" 200 -
Unexpected parameter: lang


Downloading max {0} tweets 1000


Unexpected parameter: lang
Unexpected parameter: lang
Unexpected parameter: lang
Unexpected parameter: lang
Unexpected parameter: lang
Unexpected parameter: lang
Unexpected parameter: lang
Unexpected parameter: lang
Unexpected parameter: lang
Unexpected parameter: lang
127.0.0.1 - - [15/Oct/2021 17:09:34] "[37mPOST / HTTP/1.1[0m" 200 -


static/images/charliekirk11scatter.png static/images/charliekirk11dist.png


In [34]:
my_file = Path("static/images/charliekirk11scattjjer.png")

In [35]:
if my_file.exists():
    print("already")
else:
    print("not")

not


In [None]:
#scrap code
# firing up torch 
initialize_torch()
load_model()
sub_dataloader = tokenize_csv(tweet_df)
predictions_df= predict_final(sub_dataloader)
calculations(predictions_df)

In [None]:
#sample accounts
'''
RyanAFournier
charliekirk11
TheHRH

imillhiser
owillis
'''