# ISYE 6740 Project: Pipeline Setup

- Data overview
- Text cleaning
- Pipeline build and application

In [1]:
# Installations
!pip install spacy --quiet
!pip install spacymoji --quiet
!pip install spacy_transformers --quiet
!pip install spacy-huggingface-pipelines
!pip install LeXmo

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.5/421.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.9/197.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.0/314.0 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-huggingface-pipelines
  Downloading spacy_huggingface_pipelines-0.0.4-py2.py3-none-any.whl (11 kB)
Installing collected packages: spacy-huggingface-pipelines
Successfully installed spacy-huggingface-pipelines-0.0.4
Collecting LeXmo
  Downloading LeXmo-0.1.4-py3-none-any.whl (6.9 kB)
Installing collected packages: LeXmo
Successfully installed LeXmo-0.1.4


In [2]:
# Imports
import os
import sys
import pandas as pd
import numpy as np
import re
import string
from spacy.matcher import Matcher
from spacy.lang.en.stop_words import STOP_WORDS
from spacy_transformers import Transformer
from spacy_transformers.pipeline_component import DEFAULT_CONFIG
from spacy import Language
import spacy
import spacymoji
from transformers import pipeline
from tqdm.notebook import tqdm_notebook
import math
import joblib
from collections import defaultdict
from LeXmo import LeXmo

# Secrets
from google.colab import userdata
userdata.get('HF_TOKEN')

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Import the data set
data = pd.read_csv('/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/ev_Oct2022.csv')

# Check
data.head()

Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,cashtags,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1584552942006603778,1584552942006603778,1666622000000.0,2022-10-24 09:30:35,-500,,India gets a cheap scooter option #India #elec...,en,"['india', 'electricvehicles']",[],...,,,,,,[],,,,
1,1584552583435206656,1584552583435206656,1666622000000.0,2022-10-24 09:29:10,-500,,WHY LITHIUM? Demand for lithium is increas...,en,"['nwtt', 'nwttmining', 'miningbullies']",[],...,,,,,,[],,,,
2,1584552453588267010,1584537017827721216,1666622000000.0,2022-10-24 09:28:39,-500,,@newdougman There’s a need for vehicles to tra...,en,[],[],...,,,,,,"[{'screen_name': 'newdougman', 'name': 'Doug',...",,,,
3,1584551852561006592,1584551852561006592,1666622000000.0,2022-10-24 09:26:15,-500,,Electric vehicles are growing in popularity no...,en,[],[],...,,,,,,[],,,,
4,1584551769048227841,1584551769048227841,1666622000000.0,2022-10-24 09:25:55,-500,,Our live demo on the stage of Web Summit Lisbo...,en,"['livedemo', 'websummit', 'websummit2022', 'ev...",[],...,,,,,,[],,,,


## Data Cleaning

- Remove columns based on certain criteria:
  - Completely empty.
  - Low or no variance columns.
  - Columns with irrelevant data.
- Remove non-English tweets.
- Remove tweets with no content aside from hyperlinks or hashtags.

### Remove Columns Based on Certain Criteria

In [None]:
# Find emptiness
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36920 entries, 0 to 36919
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               36920 non-null  int64  
 1   conversation_id  36920 non-null  int64  
 2   created_at       36920 non-null  float64
 3   date             36920 non-null  object 
 4   timezone         36920 non-null  int64  
 5   place            14 non-null     object 
 6   tweet            36920 non-null  object 
 7   language         36920 non-null  object 
 8   hashtags         36920 non-null  object 
 9   cashtags         36920 non-null  object 
 10  user_id          36920 non-null  int64  
 11  user_id_str      36920 non-null  int64  
 12  username         36920 non-null  object 
 13  name             36920 non-null  object 
 14  day              36920 non-null  int64  
 15  hour             36920 non-null  int64  
 16  link             36920 non-null  object 
 17  urls        

In [None]:
# Columns to drop
drop_cols = []

# Iterate columns in data set
for col in data.columns:
    # Drop completely empty columns
    if data[col].isnull().sum() == data.shape[0]:
        drop_cols.append(col)

# Drop columns
data.drop(drop_cols, axis=1, inplace=True)

# Check
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36920 entries, 0 to 36919
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               36920 non-null  int64  
 1   conversation_id  36920 non-null  int64  
 2   created_at       36920 non-null  float64
 3   date             36920 non-null  object 
 4   timezone         36920 non-null  int64  
 5   place            14 non-null     object 
 6   tweet            36920 non-null  object 
 7   language         36920 non-null  object 
 8   hashtags         36920 non-null  object 
 9   cashtags         36920 non-null  object 
 10  user_id          36920 non-null  int64  
 11  user_id_str      36920 non-null  int64  
 12  username         36920 non-null  object 
 13  name             36920 non-null  object 
 14  day              36920 non-null  int64  
 15  hour             36920 non-null  int64  
 16  link             36920 non-null  object 
 17  urls        

In [None]:
# Low variance columns
low_var_cols = []

# Iterate columns
for col in data.columns:
    # Identify low variance columns
    if data[col].dtype == 'object':
      # Check if column is low variance
      if (data[col].nunique() == 1):
          low_var_cols.append(col)
    else:
      if data[col].var() < 0.1:
        low_var_cols.append(col)

# Drop columns
data.drop(low_var_cols, axis=1, inplace=True)

# Check
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36920 entries, 0 to 36919
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               36920 non-null  int64  
 1   conversation_id  36920 non-null  int64  
 2   created_at       36920 non-null  float64
 3   date             36920 non-null  object 
 4   place            14 non-null     object 
 5   tweet            36920 non-null  object 
 6   language         36920 non-null  object 
 7   hashtags         36920 non-null  object 
 8   cashtags         36920 non-null  object 
 9   user_id          36920 non-null  int64  
 10  user_id_str      36920 non-null  int64  
 11  username         36920 non-null  object 
 12  name             36920 non-null  object 
 13  day              36920 non-null  int64  
 14  hour             36920 non-null  int64  
 15  link             36920 non-null  object 
 16  urls             36920 non-null  object 
 17  photos      

In [None]:
# Remove columns because they offer little to the analysis
no_val_cols = ['created_at', 'place', 'user_id_str', 'username', 'day', 'hour']

# Drop
data.drop(no_val_cols, axis=1, inplace=True)

# Check
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36920 entries, 0 to 36919
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               36920 non-null  int64 
 1   conversation_id  36920 non-null  int64 
 2   date             36920 non-null  object
 3   tweet            36920 non-null  object
 4   language         36920 non-null  object
 5   hashtags         36920 non-null  object
 6   cashtags         36920 non-null  object
 7   user_id          36920 non-null  int64 
 8   name             36920 non-null  object
 9   link             36920 non-null  object
 10  urls             36920 non-null  object
 11  photos           36920 non-null  object
 12  video            36920 non-null  int64 
 13  thumbnail        6870 non-null   object
 14  nlikes           36920 non-null  int64 
 15  nreplies         36920 non-null  int64 
 16  nretweets        36920 non-null  int64 
 17  quote_url        2272 non-null 

In [None]:
# Remove non-English language tweets
data = data[data['language'] == 'en']

# Check
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19016 entries, 0 to 36918
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               19016 non-null  int64 
 1   conversation_id  19016 non-null  int64 
 2   date             19016 non-null  object
 3   tweet            19016 non-null  object
 4   language         19016 non-null  object
 5   hashtags         19016 non-null  object
 6   cashtags         19016 non-null  object
 7   user_id          19016 non-null  int64 
 8   name             19016 non-null  object
 9   link             19016 non-null  object
 10  urls             19016 non-null  object
 11  photos           19016 non-null  object
 12  video            19016 non-null  int64 
 13  thumbnail        3929 non-null   object
 14  nlikes           19016 non-null  int64 
 15  nreplies         19016 non-null  int64 
 16  nretweets        19016 non-null  int64 
 17  quote_url        1120 non-null 

In [None]:
# Function to remove tweets that only have URLs in them
def removeURLOnlyTweets(tweet):
  '''
  Uses regular expressions to identify tweets that only have URLs in them.
  '''
  # Regex pattern
  url_match_pattern = r'https?://\S+'
  # Check if the tweet contains only URLs
  if re.match(url_match_pattern, tweet) is not None:
    return True
  else:
    return False

# Remove tweets
data = data[data['tweet'].apply(removeURLOnlyTweets) == False]

# Check
data.shape

(19016, 20)

In [None]:
# Function to remove tweets that only have hashtags in them
def removeHashtagOnlyTweets(tweet):
  '''
  Uses regular expressions to identify tweets that only have hashtags in them.
  '''
  # Regex pattern
  hashtag_match_pattern = r'#[^\s#]+'
  # Check if the tweet contains only hashtags
  if re.match(hashtag_match_pattern, tweet) is not None:
    return True
  else:
    return False

# Remove tweets
data = data[data['tweet'].apply(removeHashtagOnlyTweets) == False]

# Check
data.shape

(18301, 20)

In [None]:
# Update index
data.reset_index(drop=True, inplace=True)

## Build and Apply Pipeline

The transformers library by HuggingFace will be used to build a multi-step pipeline. The important components will be:
1. Text classification
2. Sentiment analysis
3. Named entity recognition



In [None]:
# Import the spacy model
!python -m spacy download en_core_web_lg

In [None]:
# Set the pipeline object
nlp = spacy.load('en_core_web_lg')

In [None]:
# HuggingFace transformer pipeline elements
#multiclass_pipe = pipeline("text-classification", model="cardiffnlp/tweet-topic-21-multi") # Multi-topic tweet classification

In [None]:
# Create a custom spacy component for applying the multi-topic tweet classification transformer
#@Language.component("multiclassPipe")
#def multiclassPipe(doc):
  #'''
  #Applies multiclass_pipe to a doc object.
  #'''
  # Grab the text
  #tweet = doc.text
  # Process the text
  #multiclass_features = multiclass_pipe(tweet)
  # Add the transformer to the doc object's features
  #doc._.multiclass_features = multiclass_features
  #return doc

In [None]:
# Register the new extension with a getter function
#from spacy.tokens import Doc
#Doc.set_extension('multiclass_features', getter=multiclassPipe, force=True)

In [None]:
# Tweet sentiment pipeline element
tweet_sentiment = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# Create a custom spacy component for applying the tweet sentiment transformer
@Language.component("sentimentPipe")
def sentimentPipe(doc):
  '''
  Applies tweet_sentiment to a doc object.
  '''
  # Grab the text
  tweet = doc.text
  # Process the text
  sentiment_analysis = tweet_sentiment(tweet)
  # Add the transformer to the doc object's features
  doc._.sentiment_analysis = sentiment_analysis
  return doc

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Register the new extension
from spacy.tokens import Doc
Doc.set_extension('sentiment_analysis', default=None, force=True)

In [None]:
# Add additional elements
nlp.add_pipe('emoji', first=True) # emoji tokenization
nlp.add_pipe('sentencizer', after='parser') # Ability to tokenize sentences
nlp.add_pipe('hf_text_pipe',
             config={'model': 'cardiffnlp/tweet-topic-21-multi'},
             after='ner') # Multitopic tweet classification
nlp.add_pipe('sentimentPipe', after='hf_text_pipe') # Tweet sentiment

# Components
nlp.components

config.json:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

[('emoji', <spacymoji.Emoji at 0x7c45c84663b0>),
 ('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7c474291a8c0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7c460d536e00>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7c45c3db20a0>),
 ('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0x7c45f8b22100>),
 ('senter', <spacy.pipeline.senter.SentenceRecognizer at 0x7c4742918e20>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7c45f1d7e480>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7c45f30f6240>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7c45c3db1d90>),
 ('hf_text_pipe',
  <spacy_huggingface_pipelines.text_classification.HFTextPipe at 0x7c45f167a3c0>),
 ('sentimentPipe', <function __main__.sentimentPipe(doc)>)]

In [None]:
# Analyze the pipeline
pipeline_analysis = nlp.analyze_pipes()

# Check for problems
pipeline_analysis['problems']

{'emoji': [],
 'tok2vec': [],
 'tagger': [],
 'parser': [],
 'sentencizer': [],
 'attribute_ruler': [],
 'lemmatizer': [],
 'ner': [],
 'hf_text_pipe': [],
 'sentimentPipe': []}

In [None]:
# Calculating batches
batch_size = 1000
batches = math.ceil(data.shape[0]/batch_size)

# Show
print(f'Batches: {batches}')

Batches: 19


In [None]:
# Generate doc objects
%%time
docs = [doc for doc in nlp.pipe(data['tweet'], batch_size=batch_size)]

# Export
joblib.dump(docs, '/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/docs.joblib')

CPU times: user 5h 34min 23s, sys: 1h 8min 22s, total: 6h 42min 46s
Wall time: 1h 49min 11s


['/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/docs.joblib']

__Note__: The code above was updated for joblib to export to the correct location.

## Generate Emotions via LeXmo

In [None]:
# Iterate
for start in range(0, data.shape[0], batch_size):
  # Initialize tqdm with the total number of iterations
  progress_bar = tqdm_notebook(total=batch_size, desc="Processing")
  # Create the dictionary
  this_dict = defaultdict(dict)
  end = min(start + batch_size, data.shape[0])
  # Get the batch using iloc
  batch = data.iloc[start:end]
  # Process the batch using iterrows
  for index, row in batch.iterrows():
    # Process for emotions
    emotions = LeXmo.LeXmo(row.tweet)
    emotions.pop('text', None)
    this_dict[index] = emotions
    # Update progress bar
    progress_bar.update(1)
  # Close progress bar
  progress_bar.close()
  # Dump with joblib
  joblib.dump(this_dict, f"/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/emotions_{start}_{end-1}.joblib")

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

## Extract Pipeline Elements

22-Feb-2024: Need to reload in data from joblib because of disconnect for the day.

In [None]:
# Load in docs
docs = joblib.load('/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/docs.joblib')

#### Functions
 __Note:__ Move these to a file

In [None]:
# Extract emoji
def emojiExtractor(doc):
  '''
  For a document, isolate the emojis into a list.
  Return the emojis.
  '''
  # Retrieve the emojis
  emojis = doc._.emoji
  # Return the emojis
  return emojis

In [None]:
# Extract tokens of cleaned texts
def wordTokens(doc):
  '''
  For a document, isolate the tokens of a cleaned text into a list.
  Return the tokens.
  '''
  # Retrieve the tokens
  word_tokens = [token.text for token in doc if token.text.isalnum()]
  # Return the word tokens
  return word_tokens

In [None]:
# Extract sentence tokens
def sentTokenCount(doc):
  '''
  For a document, isolate the sentence tokens.
  Return the sentence tokens.
  '''
  # Retrieve the tokens
  sentence_tokens = [sent for sent in doc.sents]
  # Return the sentence tokens
  return sentence_tokens

In [None]:
# Extract part of speech tags
def posTagExtractor(doc):
  '''
  Return a list of Parts of Speech tags for a document.
  '''
  # Return a list of Parts of Speech tags
  return [token.pos_ for token in doc]

In [None]:
# Extract lemmas
def lemmaExtractor(doc):
  '''
  Return a list of lemmas
  '''
  return [token.lemma_ for token in doc]

In [None]:
# Extract NER tags
def nerExtractor(doc):
  '''
  Return a list of NER tags.
  '''
  return [(ent.text, ent.label_) for ent in doc.ents]

In [None]:
# Extract the classification elements
def multiclassExtractor(doc):
  '''
  Return a list of multiclass elements.
  '''
  return doc.cats

In [None]:
# Sentiment extraction
def sentimentExtractor(doc):
  '''
  Return a list of sentiment elements.
  '''
  return doc._.sentiment_analysis

In [None]:
# Create a dataframe
full_df = pd.DataFrame({'text': data['tweet']})

In [None]:
# Add the elements
full_df['emoji'] = [emojiExtractor(doc) for doc in docs]
full_df['pos_tags'] = [posTagExtractor(doc) for doc in docs] # Parts of speech tags
full_df['lemmas'] = [lemmaExtractor(doc) for doc in docs] # Lemmas
full_df['ner_tags'] = [nerExtractor(doc) for doc in docs] # NER tags
full_df['multiclass'] = [multiclassExtractor(doc) for doc in docs] # Multiclass elements
full_df['sentiment'] = [sentimentExtractor(doc) for doc in docs] # Sentiment elements

In [None]:
# Add to dataframe
full_df['word_tokens'] = [wordTokens(doc) for doc in docs]
full_df['word_counts'] = [len(l) for l in full_df['word_tokens']]

In [None]:
# Add to dataframe
full_df['sent_tokens'] = [sentTokenCount(doc) for doc in docs]
full_df['sent_counts'] = [len(l) for l in full_df['sent_tokens']]

In [None]:
# Check
full_df.head()

Unnamed: 0,text,emoji,pos_tags,lemmas,ner_tags,multiclass,sentiment,word_tokens,word_counts,sent_tokens,sent_counts
0,India gets a cheap scooter option #India #elec...,[],"[PROPN, VERB, DET, ADJ, NOUN, NOUN, ADP, PROPN...","[India, get, a, cheap, scooter, option, #, Ind...","[(India, GPE), (#India #, MONEY)]","{'science_&_technology': 0.7013693451881409, '...","[{'label': 'neutral', 'score': 0.5173878669738...","[India, gets, a, cheap, scooter, option, India...",8,"[(India, gets, a, cheap, scooter, option, #, I...",1
1,WHY LITHIUM? Demand for lithium is increas...,[],"[SCONJ, X, PUNCT, SPACE, NOUN, ADP, NOUN, AUX,...","[why, lithium, ?, , demand, for, lithium, ...","[(NWTT, ORG), (#NWTTmining #MiningBullies htt...","{'science_&_technology': 0.8292640447616577, '...","[{'label': 'neutral', 'score': 0.5242328643798...","[WHY, LITHIUM, Demand, for, lithium, is, incre...",38,"[(WHY, LITHIUM, ?, , Demand, for, lithium,...",3
2,@newdougman There’s a need for vehicles to tra...,[],"[NOUN, PRON, VERB, DET, NOUN, SCONJ, NOUN, PAR...","[@newdougman, there, ’, a, need, for, vehicle,...",[],"{'news_&_social_concern': 0.7064180970191956, ...","[{'label': 'neutral', 'score': 0.7223793268203...","[There, a, need, for, vehicles, to, transport,...",16,"[(@newdougman, There, ’s, a, need, for, vehicl...",1
3,Electric vehicles are growing in popularity no...,[],"[ADJ, NOUN, AUX, VERB, ADP, NOUN, ADV, PUNCT, ...","[electric, vehicle, be, grow, in, popularity, ...","[(BMW, ORG)]","{'science_&_technology': 0.9114550948143005, '...","[{'label': 'positive', 'score': 0.928408384323...","[Electric, vehicles, are, growing, in, popular...",19,"[(Electric, vehicles, are, growing, in, popula...",3
4,Our live demo on the stage of Web Summit Lisbo...,[],"[PRON, ADJ, NOUN, ADP, DET, NOUN, ADP, PROPN, ...","[our, live, demo, on, the, stage, of, Web, Sum...","[(Web Summit Lisbon, ORG), (10 days, DATE), (&...","{'science_&_technology': 0.9138903617858887, '...","[{'label': 'neutral', 'score': 0.5375091433525...","[Our, live, demo, on, the, stage, of, Web, Sum...",31,"[(Our, live, demo, on, the, stage, of, Web, Su...",3


In [None]:
# file path
file_path = '/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/'

In [None]:
# Emotions master list
emo_master_dict = []

# Read the directory
for file in os.listdir(file_path):
  if file.startswith('emotions_'):
    # Read the file
    this_emo_dict = joblib.load(file_path + file)
    #Append to emo_master_list
    emo_master_dict.append(this_emo_dict)

In [None]:
# Start the emotions dataframe from the first dict, emotions_0_999
emo_df = pd.DataFrame.from_dict(emo_master_dict[0]).T

# Show
emo_df.head()

Unnamed: 0,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0
1,0.018868,0.0,0.0,0.0,0.018868,0.037736,0.018868,0.0,0.0,0.018868
2,0.0,0.05,0.0,0.0,0.05,0.0,0.05,0.0,0.05,0.05
3,0.0,0.04,0.0,0.0,0.04,0.0,0.08,0.0,0.0,0.04
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


For each identified dictionary of emotions, iterate each nested dictionary and place respective values per index from the numeric key.

In [None]:
# Column names
emo_col_names = emo_df.columns

# Iterate outermost - skip the first dictionary at point 0
for emo_dict in emo_master_dict[1:]:
  # Iterate the nested dictionaries
  for emo_key in emo_dict:
    # Iterate the keys
    for col_name in emo_col_names:
      # Set the row in the column to the value of the corresponding column in the matching key
      emo_df.loc[emo_key, col_name] = emo_dict[emo_key][col_name]

# Show
emo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18301 entries, 0 to 18300
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   anger         18301 non-null  float64
 1   anticipation  18301 non-null  float64
 2   disgust       18301 non-null  float64
 3   fear          18301 non-null  float64
 4   joy           18301 non-null  float64
 5   negative      18301 non-null  float64
 6   positive      18301 non-null  float64
 7   sadness       18301 non-null  float64
 8   surprise      18301 non-null  float64
 9   trust         18301 non-null  float64
dtypes: float64(10)
memory usage: 2.0 MB


In [None]:
# Save emo_df with joblib
joblib.dump(emo_df, '/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/emo_df.joblib')

['/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/emo_df.joblib']

### Merge Dataframes

In [None]:
# Combine full_df with emo_df
full_df = pd.merge(full_df, emo_df, left_index=True, right_index=True)

# Check
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18301 entries, 0 to 18300
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   text          18301 non-null  object 
 1   emoji         18301 non-null  object 
 2   pos_tags      18301 non-null  object 
 3   lemmas        18301 non-null  object 
 4   ner_tags      18301 non-null  object 
 5   multiclass    18301 non-null  object 
 6   sentiment     18301 non-null  object 
 7   word_tokens   18301 non-null  object 
 8   word_counts   18301 non-null  int64  
 9   sent_tokens   18301 non-null  object 
 10  sent_counts   18301 non-null  int64  
 11  anger         18301 non-null  float64
 12  anticipation  18301 non-null  float64
 13  disgust       18301 non-null  float64
 14  fear          18301 non-null  float64
 15  joy           18301 non-null  float64
 16  negative      18301 non-null  float64
 17  positive      18301 non-null  float64
 18  sadness       18301 non-nu

In [None]:
full_df.head(2)

Unnamed: 0,text,emoji,pos_tags,lemmas,ner_tags,multiclass,sentiment,word_tokens,word_counts,sent_tokens,...,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,India gets a cheap scooter option #India #elec...,[],"[PROPN, VERB, DET, ADJ, NOUN, NOUN, ADP, PROPN...","[India, get, a, cheap, scooter, option, #, Ind...","[(India, GPE), (#India #, MONEY)]","{'science_&_technology': 0.7013693451881409, '...","[{'label': 'neutral', 'score': 0.5173878669738...","[India, gets, a, cheap, scooter, option, India...",8,"[(India, gets, a, cheap, scooter, option, #, I...",...,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0
1,WHY LITHIUM? Demand for lithium is increas...,[],"[SCONJ, X, PUNCT, SPACE, NOUN, ADP, NOUN, AUX,...","[why, lithium, ?, , demand, for, lithium, ...","[(NWTT, ORG), (#NWTTmining #MiningBullies htt...","{'science_&_technology': 0.8292640447616577, '...","[{'label': 'neutral', 'score': 0.5242328643798...","[WHY, LITHIUM, Demand, for, lithium, is, incre...",38,"[(WHY, LITHIUM, ?, , Demand, for, lithium,...",...,0.018868,0.0,0.0,0.0,0.018868,0.037736,0.018868,0.0,0.0,0.018868


#### Expand Multiclass Categories

In [None]:
# Get the keys for column values
multiclass_cols = full_df['multiclass'][0].keys()

In [None]:
# Create a multiclass dataframe
multiclass_df = pd.DataFrame(columns=multiclass_cols)

In [None]:
# Iterate to add values
for index, row in full_df.iterrows():
  # Get the dictionary
  this_dict = row['multiclass']
  # Iterate the keys
  for key in multiclass_cols:
    # Set the value
    multiclass_df.loc[index, key] = this_dict[key]

# Check
multiclass_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18301 entries, 0 to 18300
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   science_&_technology      18301 non-null  object
 1   business_&_entrepreneurs  18301 non-null  object
 2   travel_&_adventure        18301 non-null  object
 3   news_&_social_concern     18301 non-null  object
 4   other_hobbies             18301 non-null  object
 5   learning_&_educational    18301 non-null  object
 6   diaries_&_daily_life      18301 non-null  object
 7   film_tv_&_video           18301 non-null  object
 8   food_&_dining             18301 non-null  object
 9   gaming                    18301 non-null  object
 10  arts_&_culture            18301 non-null  object
 11  music                     18301 non-null  object
 12  fashion_&_style           18301 non-null  object
 13  youth_&_student_life      18301 non-null  object
 14  celebrity_&_pop_cultur

In [None]:
multiclass_df.head()

Unnamed: 0,science_&_technology,business_&_entrepreneurs,travel_&_adventure,news_&_social_concern,other_hobbies,learning_&_educational,diaries_&_daily_life,film_tv_&_video,food_&_dining,gaming,arts_&_culture,music,fashion_&_style,youth_&_student_life,celebrity_&_pop_culture,sports,family,fitness_&_health,relationships
0,0.701369,0.667302,0.407411,0.168905,0.102641,0.061035,0.046702,0.04317,0.034387,0.033693,0.031985,0.03028,0.019584,0.016662,0.014812,0.01148,0.00977,0.006419,0.006238
1,0.829264,0.789338,0.029812,0.738387,0.043394,0.045545,0.023964,0.018049,0.01419,0.011499,0.007,0.014668,0.004553,0.012153,0.017707,0.007431,0.005151,0.011765,0.00636
2,0.377337,0.529803,0.371921,0.706418,0.057103,0.06503,0.055797,0.011787,0.057155,0.010052,0.014578,0.010987,0.007539,0.019876,0.009816,0.008621,0.007206,0.009636,0.004385
3,0.911455,0.575176,0.175719,0.441231,0.087428,0.09117,0.045975,0.026381,0.025959,0.024049,0.021404,0.033546,0.008465,0.02552,0.013708,0.007525,0.008054,0.008303,0.009218
4,0.91389,0.50357,0.063532,0.161176,0.075994,0.091335,0.029815,0.061666,0.015161,0.012627,0.023123,0.045617,0.006156,0.017377,0.026286,0.008329,0.006534,0.006138,0.008212


In [None]:
# Save with joblib
joblib.dump(multiclass_df, '/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/multiclass_df.joblib')

['/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/multiclass_df.joblib']

In [None]:
# Merge with full_df
full_df = pd.merge(full_df, multiclass_df, left_index=True, right_index=True)

# Check
#full_df.info()

In [None]:
# Drop multiclass
full_df = full_df.drop(columns=['multiclass'])

# Check
#full_df.info()

In [None]:
# Sentiment columns
sentiment_cols = full_df['sentiment'][0][0].keys()

In [None]:
# Create a dataframe
sentiment_df = pd.DataFrame(columns=sentiment_cols)

In [None]:
# Iterate to add values
for index, row in full_df.iterrows():
  # Get the dictionary
  this_dict = row['sentiment'][0]
  # Iterate the keys
  for key in sentiment_cols:
    # Set the value
    sentiment_df.loc[index, key] = this_dict[key]

# Check
sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18301 entries, 0 to 18300
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   18301 non-null  object
 1   score   18301 non-null  object
dtypes: object(2)
memory usage: 945.0+ KB


In [None]:
# Export via joblib
joblib.dump(sentiment_df, '/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/sentiment_df.joblib')

['/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/sentiment_df.joblib']

In [None]:
# Merge with full_df
full_df = pd.merge(full_df, sentiment_df, left_index=True, right_index=True)

# Drop full_df['sentiment']
full_df = full_df.drop(columns=['sentiment'])


In [None]:
# Add the conversation id back to the dataframe
full_df.insert(0, 'conversation_id', data.conversation_id)

In [None]:
# Add the id back to the dataframe
full_df.insert(0, 'id', data.id)

In [None]:
# Convert lists of tokenized sentences to strings
full_df['sent_tokens'] = full_df['sent_tokens'].apply(lambda x: [' '.join(sentence) for sentence in x])

In [None]:
# Convert lists of tokenized sentences to strings
full_df['sent_tokens'] = full_df['sent_tokens'].apply(lambda x: [' '.join(token.text for token in sentence) for sentence in x])

In [None]:
# joblib
joblib.dump(test_df, '/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/test_df.joblib')

['/content/drive/MyDrive/Georgia Tech OMSA/ISYE6740/Project/outputs/test_df.joblib']