In [1]:
# Mount Google Drive at /content/drive; the dataset read further down lives under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Fetch the NLTK resources used by this notebook (stop-word lists and the punkt tokenizer data),
# then show the English stop words.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)
print(stopwords.words('english'))

import matplotlib.pyplot as plt
from wordcloud import WordCloud

!pip install transformers
from transformers import pipeline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
# Location of the pre-processed export on the mounted Drive.
REVIEWS_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/asurion/asurion_preprocess.csv'
df = pd.read_csv(REVIEWS_CSV_PATH)

In [4]:
# Number of missing values in each column.
df.isna().sum()

titre              0
nom                8
stars              0
localisation       0
nb_reviews         0
date_review        0
date_experience    0
comment            0
dtype: int64

In [5]:
# Keep only the rows where `nom` is present.
df = df[df.nom.notna()]

In [6]:
# Number of comments per star rating, ordered by rating.
# `rename_axis` + `reset_index(name=...)` produces the columns ["stars", "nb_comments"]
# regardless of how `value_counts()` names its result: pandas < 2.0 names the Series after
# the column, pandas >= 2.0 names it "count", which broke the previous
# rename(columns={"index": ..., "stars": ...}) mapping.
df_rating_distr = (
    df["stars"]
    .value_counts()
    .rename_axis("stars")
    .reset_index(name="nb_comments")
    .sort_values(by="stars")
)

fig = px.bar(
    x=df_rating_distr["stars"],
    y=df_rating_distr["nb_comments"],
    text_auto=".2s",
    title="Distribution of comments by ratings",
    labels={"x": "Rating", "y": "Number of comments"},
)
fig.update_xaxes(type="category")  # treat ratings as discrete categories, not a numeric axis
fig.update_traces(textfont_size=12, textangle=0, textposition="outside")
fig.update_layout(plot_bgcolor="white")
fig.show()

In [7]:
# The 20 `localisation` values with the most comments.
# `rename_axis` + `reset_index(name=...)` produces the columns ["localisation", "nb_comments"]
# on both pandas < 2.0 and pandas >= 2.0, where `value_counts()` names its result "count"
# and the previous rename(columns={"index": ..., "localisation": ...}) mapping no longer applies.
df_loc_distr = (
    df["localisation"]
    .value_counts()
    .rename_axis("localisation")
    .reset_index(name="nb_comments")
    .sort_values(by="nb_comments", ascending=False)
    .head(20)
)

fig = px.bar(
    x=df_loc_distr["localisation"],
    y=df_loc_distr["nb_comments"],
    text_auto=".2s",
    log_y=True,  # counts span several orders of magnitude
    title="Distribution of comments by localisation (on a LOG scale)",
    labels={"x": "Country", "y": "Number of comments"},
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside")
fig.update_layout(plot_bgcolor="white")
fig.show()

In [8]:
# The 10 most frequent `nb_reviews` values and how many comments carry each of them.
# `rename_axis` + `reset_index(name=...)` produces the columns ["nb_reviews", "nb_comments"]
# on both pandas < 2.0 and pandas >= 2.0, where `value_counts()` names its result "count"
# and the previous rename(columns={"index": ..., "nb_reviews": ...}) mapping no longer applies.
df_nb_rev_distr = (
    df["nb_reviews"]
    .value_counts()
    .rename_axis("nb_reviews")
    .reset_index(name="nb_comments")
    .sort_values(by="nb_comments", ascending=False)
    .head(10)
)

fig = px.bar(
    x=df_nb_rev_distr["nb_reviews"],
    y=df_nb_rev_distr["nb_comments"],
    text_auto=".2s",
    log_y=True,
    title="Distribution of comments based on how many comments were left by clients (on a LOG scale)",
    labels={"x": "Number of reviews left by clients", "y": "Number of comments"},
)
fig.update_xaxes(type="category")  # keep the bars in frequency order instead of numeric order
fig.update_traces(textfont_size=12, textangle=0, textposition="outside")
fig.update_layout(plot_bgcolor="white")
fig.show()

In [9]:
# Discard the rows whose `nb_reviews` is 0.
df = df.loc[df["nb_reviews"] != 0]

In [10]:
# Row count, dtypes and non-null counts after the filters applied so far.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72886 entries, 0 to 72895
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   titre            72886 non-null  object
 1   nom              72886 non-null  object
 2   stars            72886 non-null  int64 
 3   localisation     72886 non-null  object
 4   nb_reviews       72886 non-null  int64 
 5   date_review      72886 non-null  object
 6   date_experience  72886 non-null  object
 7   comment          72886 non-null  object
dtypes: int64(2), object(6)
memory usage: 5.0+ MB


In [11]:
# Contingency table of `nb_reviews` (rows) against `stars` (columns), including the "All" totals.
df_revs_stars = pd.crosstab(
    index=df["nb_reviews"],
    columns=df["stars"],
    margins=True,
    dropna=False,
)
df_revs_stars

stars,1,2,3,4,5,All
nb_reviews,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,6207,1621,2190,5114,40734,55866
2,998,295,457,988,8726,11464
3,269,81,127,292,2584,3353
4,103,34,56,120,913,1226
5,38,15,15,39,382,489
6,17,8,9,15,184,233
7,9,4,4,10,96,123
8,11,3,0,3,63,80
9,4,1,4,2,41,52
All,7656,2062,2862,6583,53723,72886


In [12]:
#df_revs_stars_pct = pd.crosstab(df.nb_reviews, df.stars, dropna=False, margins=True, normalize="index")
#df_revs_stars_pct

In [13]:
# Separate the label (`stars`) from the remaining columns.
target = df["stars"]
data = df.drop(columns="stars")

In [14]:
# Hold out 20 % of the rows for testing; stratify on the rating and fix the seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    stratify=target,
    shuffle=True,
    test_size=0.2,
    random_state=1,
)

In [15]:
def trasnform_date_columns(df):
    """
    Parse the two date columns and expand them into integer calendar parts.

    `date_review` and `date_experience` are converted to datetimes, then one
    `<column>_<part>` integer column is written per part: year, month, day and
    hour for `date_review`; year, month and day for `date_experience`.

    Args:
        df (pandas.DataFrame): Frame holding `date_review` and `date_experience`.
            It is modified in place.

    Returns:
        pandas.DataFrame: The same frame object, with the extra columns added.
    """
    parts_by_column = {
        "date_review": ("year", "month", "day", "hour"),
        "date_experience": ("year", "month", "day"),
    }

    # Parse both columns first so a bad value fails before any derived column is added.
    for column in parts_by_column:
        df[column] = pd.to_datetime(df[column])

    for column, parts in parts_by_column.items():
        for part in parts:
            df[f"{column}_{part}"] = getattr(df[column].dt, part).astype(int)
    return df

In [16]:
def delete_useless_columns(df, columns_to_delete):
    """
    Return a copy of a DataFrame without the listed columns.

    Args:
        df (pandas.DataFrame): The source DataFrame; it is not modified.
        columns_to_delete (list): Names of the columns to leave out.

    Returns:
        pandas.DataFrame: A new DataFrame lacking `columns_to_delete`.
    """
    return df.drop(columns=columns_to_delete)

In [17]:
def add_comment_stat(df, text_column):
    """
    Add length statistics for a text column.

    Args:
        df (pandas.DataFrame): Frame containing the text column; modified in place.
        text_column (str): Name of the column holding the text.

    Returns:
        pandas.DataFrame: The same frame, with `char_count` (number of characters)
        and `word_count` (number of whitespace-separated tokens) added.
    """
    comments = df[text_column]
    df['char_count'] = comments.str.len()
    df['word_count'] = comments.str.split().str.len()
    return df

In [18]:
def average_rating_per_loc(df, target):
    """
    Compute the mean star rating of each `localisation` value.

    Args:
        df (pandas.DataFrame): Frame containing a `localisation` column.
        target (pandas.Series): Ratings named `stars`, aligned with `df` on the index.

    Returns:
        pandas.DataFrame: One row per `localisation` (used as the index) with a
        single `stars` column holding the mean rating.
    """
    rated = pd.concat([df, target], axis=1)
    mean_stars = rated.groupby(["localisation"])["stars"].mean()
    return mean_stars.to_frame()

# Per-location mean rating, computed from the training split only; the same table is
# merged into both X_train and X_test in the preprocessing cell further down.
df_avg_rating_per_loc = average_rating_per_loc(X_train, y_train)

In [19]:
def delete_stop_words(df):
    """
    Add a `comment_no_stopword` column holding `comment` without English stop words.

    Each comment is split on whitespace, words found (case-insensitively) in NLTK's
    English stop-word list are dropped, and the remaining words are re-joined with
    single spaces. The `comment` column itself is left unchanged.

    Args:
        df (pandas.DataFrame): Frame with a string `comment` column; modified in place.

    Returns:
        pandas.DataFrame: The same frame with `comment_no_stopword` added. It is
        returned (as the docstring always stated, and as the other in-place helpers
        of this notebook do) so the result can be reassigned or chained; callers
        that ignore the return value keep working.
    """
    stop_words = set(stopwords.words('english'))

    def _strip_stop_words(comment):
        # Helper: keep only the words of one comment that are not stop words.
        return ' '.join(word for word in comment.split() if word.lower() not in stop_words)

    df["comment_no_stopword"] = df['comment'].apply(_strip_stop_words)
    return df

In [20]:
def delete_punctiation(df, comment_col):
    """
    Replace punctuation in a text column with spaces and collapse repeated spaces.

    Args:
        df (pandas.DataFrame): The DataFrame containing the text column.
        comment_col (str): The name of the column to remove punctuation from.

    Returns:
        None. The DataFrame is modified in-place.

    """
    # Every character that is not an ASCII letter, a digit or whitespace becomes a space.
    pattern = r'[^a-zA-Z0-9\s]'
    cleaned = df[comment_col].replace(pattern, ' ', regex=True)
    # Collapse each run of spaces into one. A single literal '  ' -> ' ' pass only halves
    # the runs, so "a!!!b" used to end up as "a  b" instead of "a b".
    df[comment_col] = cleaned.str.replace(r' {2,}', ' ', regex=True)

In [21]:
stemmer = PorterStemmer()

def stem_text(text):
    """
    Return `text` with every token replaced by its Porter stem.

    Args:
        text (str): Text to process.

    Returns:
        str: The stems of the tokens produced by `word_tokenize`, joined with single spaces.
    """
    return ' '.join(stemmer.stem(token) for token in word_tokenize(text))

In [22]:
def display_wordcloud_from_column(df, target_col, comment_col, rating):
    """
    Display a word cloud built from the comments that received a given rating.

    Args:
        df (pandas.DataFrame): Frame containing the text column.
        target_col (pandas.Series): Rating of each row of `df`, in the same row
            order. Rows are matched by position, not by index label.
        comment_col (str): Name of the column of `df` holding the text.
        rating: Rating value whose comments are plotted.

    Returns:
        None. The figure is rendered with matplotlib.

    Raises:
        ValueError: If `target_col` and `df` do not have the same number of rows.
    """
    if len(target_col) != len(df):
        raise ValueError(
            "target_col must hold one rating per row of df "
            f"(got {len(target_col)} ratings for {len(df)} rows)"
        )

    # Select the rows by position. The former row-wise pd.concat([df, target_col]) stacked
    # the ratings *under* the frame instead of next to it, so the `stars` filter could not
    # work; a positional mask also stays correct when `df` already carries a column named
    # `stars` or when the two objects no longer share index labels.
    rating_mask = np.asarray(target_col) == rating
    comment_text = ' '.join(df.loc[rating_mask, comment_col].astype(str).tolist())

    # WordCloud cannot be generated from empty text, so report that case instead of failing.
    if not comment_text.strip():
        print("No comments found for " + str(rating) + " ratings.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(comment_text)

    print("Cloud of most used words in comments for "+ str(rating) +" ratings :")
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [23]:
def most_used_words(df, col, rating):
    """
    List the 15 most frequent words of a text column among the rows with a given rating.

    Args:
        df (pandas.DataFrame): Frame containing the text column and a `stars` column.
        col (str): Name of the text column to analyse.
        rating: Value of `stars` selecting the rows to analyse.

    Returns:
        pandas.DataFrame: Columns `word` and `nb_occurence`, most frequent word first,
        at most 15 rows.

    """
    selected_comments = df.loc[df.stars == rating, col]
    # Pool every selected comment into one whitespace-separated list of words.
    words = ' '.join(selected_comments).split()
    occurrences = pd.Series(words).value_counts()
    df_word_count = pd.DataFrame(occurrences).reset_index().head(15)
    df_word_count.columns = ['word','nb_occurence']
    return df_word_count

In [24]:
def preprocess_features(features, avg_rating_per_loc):
    """
    Run the feature-engineering steps shared by the train and test splits.

    Steps, in order: expand the date columns, add character/word counts (computed
    on the comment before punctuation is stripped), strip punctuation, drop the
    `titre`, `nom` and raw date columns, attach the per-location mean rating,
    remove stop words, and stem both comment variants.

    Args:
        features (pandas.DataFrame): One split of the feature frame.
        avg_rating_per_loc (pandas.DataFrame): Mean rating per `localisation`
            (indexed by `localisation`, single `stars` column).

    Returns:
        pandas.DataFrame: A new, fully prepared frame; `features` itself is not modified.
    """
    # Work on a private copy: the helpers below add columns in place, and the split
    # frames may still be flagged by pandas as derived from `data`.
    features = features.copy()

    trasnform_date_columns(features)
    add_comment_stat(features, "comment")
    delete_punctiation(features, "comment")
    features = delete_useless_columns(features, ["titre", "nom", "date_review", "date_experience"])

    # NOTE: the merged column is called `stars` and holds the *mean rating of the location*,
    # not the review's own rating. With a left join, locations missing from
    # `avg_rating_per_loc` get NaN.
    features = features.merge(avg_rating_per_loc, on = "localisation", how = "left")

    delete_stop_words(features)
    features['stemmed_comment'] = features['comment'].apply(stem_text)
    features['stemmed_comment_no_stop_words'] = features['comment_no_stopword'].apply(stem_text)
    return features


# The location averages come from the training split only and are applied to both splits.
X_train = preprocess_features(X_train, df_avg_rating_per_loc)
X_test = preprocess_features(X_test, df_avg_rating_per_loc)

In [None]:
# One word cloud per rating, from 5 stars down to 1 star.
for rating in (5, 4, 3, 2, 1):
    display_wordcloud_from_column(X_train, y_train, 'comment_no_stopword', rating)

In [None]:
# After the merge with df_avg_rating_per_loc, X_train's own `stars` column holds the
# per-location *mean* rating, so filtering on it does not select the reviews that were
# actually given 5 or 1 star. Overlay each review's real rating on a temporary copy first;
# y_train is row-aligned with X_train (same split, and the left merge keeps the row order),
# so the values are matched by position.
X_train_with_rating = X_train.assign(stars=y_train.to_numpy())

display(most_used_words(X_train_with_rating, "comment_no_stopword", 5))
display(most_used_words(X_train_with_rating, "comment_no_stopword", 1))

In [27]:
# Shapes of the prepared feature frames (train first, then test).
for feature_frame in (X_train, X_test):
    print(feature_frame.shape)

(58308, 16)
(14578, 16)


In [28]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

In [29]:
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the id -> label mapping published with TweetEval (tab-separated "<id>\t<label>" lines).
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# Keep the label of every well-formed line; blank trailing lines yield short rows and are skipped.
labels = [row[1] for row in csvreader if len(row) > 1]

model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes into a local directory named exactly like the hub id, and
# later from_pretrained(MODEL) calls resolve to that directory. Store the tokenizer next to
# the weights so that re-running this cell can still load the tokenizer from it.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [31]:
def get_sentiment_score_for_text(text):
    """
    Score one text with the sentiment model and return its class probabilities.

    Args:
        text (str): Text to classify.

    Returns:
        numpy.ndarray: Softmax of the model's logits for `text`, one value per
        class, in the model's output order.
    """
    import torch  # local import: torch is not imported at the top of the notebook

    # Cap the input length explicitly. Filtering on the number of *words* does not bound
    # the number of *tokens*, and an over-long sequence makes the forward pass fail.
    # NOTE(review): 512 is the usual limit for RoBERTa-base checkpoints; confirm against
    # model.config.max_position_embeddings.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

    # Inference only: skip autograd bookkeeping (the scores themselves are unchanged).
    with torch.no_grad():
        output = model(**encoded_input)

    # output[0] holds the logits of the batch; [0] picks the single text that was passed in.
    scores = output[0][0].detach().numpy()
    return softmax(scores)

In [32]:
# Index labels of the rows whose comment has 500 words or more, for each split.
train_index_to_delete = X_train.index[X_train.word_count >= 500]
test_index_to_delete = X_test.index[X_test.word_count >= 500]

In [33]:
# Renumber the targets 0..n-1 so their labels can be compared with the feature frames'
# index labels in the next cell.
# NOTE(review): this relies on X_train / X_test carrying a 0..n-1 index in the same row
# order as the targets (presumably a result of the earlier merge) — confirm.
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [34]:
# Drop the targets of the over-long comments identified above; copy to detach from the original Series.
y_train = y_train[~y_train.index.isin(train_index_to_delete)].copy()
y_test = y_test[~y_test.index.isin(test_index_to_delete)].copy()

In [35]:
# Keep the comments shorter than 500 words. Take an explicit copy, as is done for the
# targets: new columns are assigned to these frames later, and writing into a filtered
# slice triggers pandas' SettingWithCopyWarning.
X_train = X_train[X_train.word_count < 500].copy()
X_test = X_test[X_test.word_count < 500].copy()

In [36]:
# Features and targets of each split must show the same number of rows.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

# Persist the filtered targets.
for target_part, csv_name in ((y_train, "y_train.csv"), (y_test, "y_test.csv")):
    target_part.to_csv(csv_name)

(58270, 16)
(58270,)
(14565, 16)
(14565,)


In [None]:
# The model returns its scores in the order of `labels` (read from the TweetEval mapping
# file: 0 = negative, 1 = neutral, 2 = positive). Derive the destination columns from that
# list: the hand-written order ['neutre', 'negative', 'positive'] stored the negative score
# under 'neutre' and the neutral score under 'negative'.
LABEL_TO_COLUMN = {'negative': 'negative', 'neutral': 'neutre', 'positive': 'positive'}
score_columns = [LABEL_TO_COLUMN[label] for label in labels]  # KeyError on an unexpected label

# Create the columns up front so their order in the frame stays neutre, negative, positive.
X_train['neutre'] = 0.0
X_train['negative'] = 0.0
X_train['positive'] = 0.0

# The score frame has columns 0, 1, 2; pandas assigns them to `score_columns` by position.
X_train[score_columns] = X_train['comment_no_stopword'].apply(lambda text: pd.Series(get_sentiment_score_for_text(text)))

In [None]:
# Save first, preview last: a cell only renders the value of its final expression,
# so a `head()` placed before `to_csv` was silently discarded.
X_train.to_csv("X_train.csv")
X_train.head()

In [None]:
# The model returns its scores in the order of `labels` (read from the TweetEval mapping
# file: 0 = negative, 1 = neutral, 2 = positive). Derive the destination columns from that
# list: the hand-written order ['neutre', 'negative', 'positive'] stored the negative score
# under 'neutre' and the neutral score under 'negative'.
LABEL_TO_COLUMN = {'negative': 'negative', 'neutral': 'neutre', 'positive': 'positive'}
score_columns = [LABEL_TO_COLUMN[label] for label in labels]  # KeyError on an unexpected label

# Create the columns up front so their order in the frame stays neutre, negative, positive.
X_test['neutre'] = 0.0
X_test['negative'] = 0.0
X_test['positive'] = 0.0

# The score frame has columns 0, 1, 2; pandas assigns them to `score_columns` by position.
X_test[score_columns] = X_test['comment_no_stopword'].apply(lambda text: pd.Series(get_sentiment_score_for_text(text)))

In [None]:
# Save first, preview last: a cell only renders the value of its final expression,
# so a `head()` placed before `to_csv` was silently discarded.
X_test.to_csv("X_test.csv")
X_test.head()