In [0]:
!pip install transformers

In [0]:
!pip install torch

In [0]:
!pip install langdetect

In [0]:
import pyspark
from transformers import pipeline
import pandas as pd
from langdetect import detect, DetectorFactory
import matplotlib.pyplot as plt

In [0]:
%sql

-- Read every row and column of the chat table; the next cell picks this result up through _sqldf.
SELECT * FROM adb_hibak.silver_tf2_chat;

In [0]:
# Bring the Spark result held in _sqldf (presumably the output of the preceding
# %sql cell -- confirm) into a pandas DataFrame in a single step.
df_chat = _sqldf.toPandas()

In [0]:
# Independent copy of the chat frame that will carry the date only (no time of day).
# NOTE(review): none of the cells visible in this notebook write df_chat_date to a
# table -- confirm whether it is still needed.
df_chat_date = df_chat.copy(deep=True)

In [0]:
# Parse eventTime as datetimes, then keep only the calendar-date part.
event_timestamps = pd.to_datetime(df_chat_date['eventTime'])
df_chat_date['eventTime'] = event_timestamps.dt.date

In [0]:
# Remove all rows whose message is missing or contains only whitespace.
# For a null message .str.strip() yields a missing value, and comparing that
# with '' using != evaluates to True, so the blank check alone would keep null
# rows. A later cell casts message with astype(str), which would turn those
# nulls into literal text; notna() excludes them here instead.
df_chat = df_chat[
    df_chat['message'].notna() & (df_chat['message'].str.strip() != '')
]
df_chat_date = df_chat_date[
    df_chat_date['message'].notna() & (df_chat_date['message'].str.strip() != '')
]

In [0]:
# Inspect df_chat after the empty-message rows have been filtered out.
df_chat.display()

In [0]:
# Inspect the date-only copy after the same empty-message filter.
df_chat_date.display()

In [0]:
# Hugging Face checkpoint used to score the chat messages.
SENTIMENT_MODEL_NAME = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"

# Build the classifier once so that every message is scored by the same pipeline object.
# return_all_scores=True asks for a score per label rather than only the best one.
# NOTE(review): newer transformers releases reportedly deprecate return_all_scores in
# favour of top_k=None, which changes the output nesting -- confirm before switching.
chat_sentiment_classifier = pipeline(
    model=SENTIMENT_MODEL_NAME,
    return_all_scores=True,
)

In [0]:
# Summarise df_chat's columns (dtypes, non-null counts) before the message column is cast to str below.
df_chat.info()

In [0]:
# Convert messages to string in both DFs

# Work on fresh copies so the column assignment below targets frames that own
# their data rather than slices of the filtered originals.
df_chat = df_chat.copy()
df_chat_date = df_chat_date.copy()

for chat_frame in (df_chat, df_chat_date):
    chat_frame.loc[:, 'message'] = chat_frame['message'].astype(str)

In [0]:
# Create a new column with the sentiment scores.
# Each message is passed to the classifier on its own, so every cell of the new
# column holds the raw pipeline output for that single message.
# truncation=True is forwarded to the tokenizer so that a message longer than the
# model's maximum input length is shortened instead of failing inside the model;
# messages within the limit are scored exactly as before.

df_chat['sentiment_scores'] = df_chat['message'].apply(
    lambda message: chat_sentiment_classifier(message, truncation=True)
)

In [0]:
# Extract the sentiment scores in the correct format.
# Original output from chat_sentiment_classifier returns a list in a list with three dictionaries with label positive, neutral and negative

def extract_sentiment_info(sentiment_scores):
    """Flatten one classifier result into a positive/neutral/negative dict.

    Scores are looked up by each entry's ``label`` instead of by position, so
    the result stays correct if the entries arrive in a different order. When
    the expected labels are not all present, the original positional order
    (positive, neutral, negative) is used as a fallback.

    Args:
        sentiment_scores: Either a list whose first element is the list of
            ``{'label': ..., 'score': ...}`` dicts (the nested shape described
            above), or that inner list directly. May be empty or None.

    Returns:
        A dict with the keys ``'positive'``, ``'neutral'`` and ``'negative'``.
        Values are the matching scores, or None for every key when
        ``sentiment_scores`` is empty or None.
    """
    labels = ('positive', 'neutral', 'negative')

    if not sentiment_scores:
        return dict.fromkeys(labels)

    # Accept both the nested list-in-list shape and an already flat list of dicts.
    first_item = sentiment_scores[0]
    entries = sentiment_scores if isinstance(first_item, dict) else first_item

    score_by_label = {
        str(entry.get('label', '')).lower(): entry['score'] for entry in entries
    }
    if all(label in score_by_label for label in labels):
        return {label: score_by_label[label] for label in labels}

    # Labels missing or named differently: keep the original positional contract.
    return {label: entries[position]['score'] for position, label in enumerate(labels)}

In [0]:
# Turn each row's raw classifier output into a dict of the three scores, then
# build all three columns from one DataFrame. This avoids constructing a
# separate pd.Series per row (which .apply(pd.Series) does) and still yields the
# three columns when df_chat has no rows.
sentiment_columns = ['positive', 'neutral', 'negative']
sentiment_records = df_chat['sentiment_scores'].apply(extract_sentiment_info).tolist()
# Reusing df_chat's index keeps every score aligned with the row it came from.
df_chat[sentiment_columns] = pd.DataFrame(
    sentiment_records, index=df_chat.index, columns=sentiment_columns
)

In [0]:
# Inspect df_chat with the positive / neutral / negative columns added (sentiment_scores is still present here).
df_chat.display()

In [0]:
# The raw classifier output is no longer needed once the three score columns exist.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without a KeyError after the column is already gone.
df_chat = df_chat.drop(columns='sentiment_scores', errors='ignore')

In [0]:
# Inspect df_chat again now that the raw sentiment_scores column has been dropped.
df_chat.display()

In [0]:
# Set a seed for the langdetect detector for reproducibility.
# The seed is assigned on DetectorFactory itself and this happens before the
# first detect() call made by this notebook.
DetectorFactory.seed = 0

# Function to detect language, with error handling
def detect_language(text):
    """Return the language code that langdetect reports for ``text``.

    Detection is best-effort: any ordinary error raised by ``detect`` results
    in the placeholder ``'unknown'`` instead of an exception.

    Args:
        text: The message to classify.

    Returns:
        The value returned by ``detect(text)``, or ``'unknown'`` when
        detection fails.
    """
    try:
        return detect(text)
    except Exception:
        # Catching Exception (rather than using a bare except) keeps the
        # best-effort fallback while letting KeyboardInterrupt and SystemExit
        # propagate, so a long-running apply can still be interrupted.
        return 'unknown'

In [0]:
# Run language detection on every message and store the result in a new
# `language` column.

df_chat['language'] = df_chat['message'].map(detect_language)


In [0]:
# Peek at the first rows to check the newly added language column.
df_chat.head()

In [0]:
# Maps the language codes stored in the `language` column to readable names.
# Besides the codes listed originally, this includes the remaining codes that
# langdetect is documented to emit (TODO confirm against the installed version),
# so the column does not end up mixing raw codes with full names.
# 'unknown' is the placeholder produced by detect_language and is kept unchanged.
language_fullname = {
    'af': 'Afrikaans',
    'ar': 'Arabic',
    'bg': 'Bulgarian',
    'bn': 'Bengali',
    'ca': 'Catalan',
    'cs': 'Czech',
    'cy': 'Welsh',
    'da': 'Danish',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'es': 'Spanish',
    'et': 'Estonian',
    'fa': 'Persian',
    'fi': 'Finnish',
    'fr': 'French',
    'gu': 'Gujarati',
    'he': 'Hebrew',
    'hi': 'Hindi',
    'hr': 'Croatian',
    'hu': 'Hungarian',
    'id': 'Indonesian',
    'it': 'Italian',
    'ja': 'Japanese',
    'kn': 'Kannada',
    'ko': 'Korean',
    'lt': 'Lithuanian',
    'lv': 'Latvian',
    'mk': 'Macedonian',
    'ml': 'Malayalam',
    'mr': 'Marathi',
    'ne': 'Nepali',
    'nl': 'Dutch',
    'no': 'Norwegian',
    'pa': 'Punjabi',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'so': 'Somali',
    'sq': 'Albanian',
    'sv': 'Swedish',
    'sw': 'Swahili',
    'ta': 'Tamil',
    'te': 'Telugu',
    'th': 'Thai',
    'tl': 'Tagalog',
    'tr': 'Turkish',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'vi': 'Vietnamese',
    'zh-cn': 'Chinese (Simplified)',
    'zh-tw': 'Chinese (Traditional)',
    'unknown': 'unknown'
}

In [0]:
# Swap each detected code for its full name; a value with no entry in
# language_fullname is left exactly as it is.
df_chat['language'] = df_chat['language'].map(
    lambda code: language_fullname.get(code, code)
)

In [0]:
# Inspect df_chat after the language codes have been replaced by full names.
df_chat.display()

In [0]:
# Number of messages per language value (the result is shown, not stored).
df_chat.groupby('language').size()

In [0]:
# Count how many messages fall under each language value.
# The resulting Series is what the bar-chart cell below plots.

language_counts = df_chat['language'].value_counts()

In [0]:
# Bar chart of the message count for each language.
fig, ax = plt.subplots(figsize=(10, 6))
language_counts.plot(kind='bar', ax=ax)
ax.set_title('Occurrence of Each Language in Chat Messages')
ax.set_xlabel('Language')
ax.set_ylabel('Number of Messages')
# Tilt the language names so neighbouring labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [0]:
# Keep only the rows whose language value is 'English'.
is_english = df_chat['language'].eq('English')
df_chat_en = df_chat.loc[is_english]

In [0]:
# Inspect the English-only subset before it is converted to a Spark DataFrame.
df_chat_en.display()

In [0]:
# Convert the pandas DF to a spark DF
# NOTE(review): `spark` is not defined in any cell visible here -- presumably the
# session object supplied by the notebook runtime; confirm.
spark_df_chat_en = spark.createDataFrame(df_chat_en)
# Show the converted frame as a check before it is written to a table.
spark_df_chat_en.display()

In [0]:
# Persist the English-only results as a table; 'overwrite' replaces whatever
# that table currently holds.
spark_df_chat_en.write.saveAsTable(
    "adb_hibak.silver_tf2_chat_time_sentiment_analysis_en",
    mode='overwrite',
)

In [0]:
# Convert the pandas DF to a spark DF
# This uses the full df_chat (every language value), not the English-only subset.
# NOTE(review): `spark` is not defined in any cell visible here -- presumably the
# session object supplied by the notebook runtime; confirm.
spark_df_chat_all_languages = spark.createDataFrame(df_chat)
# Show the converted frame as a check before it is written to a table.
spark_df_chat_all_languages.display()

In [0]:
# Persist the results for every language as a table; 'overwrite' replaces
# whatever that table currently holds.
spark_df_chat_all_languages.write.saveAsTable(
    "adb_hibak.silver_tf2_chat_time_sentiment_analysis_all_languages",
    mode='overwrite',
)