# Integrated CA ML for Business and Data Visualisation

In [None]:
#!pip install jupyter_dash
#!pip install dash

In [None]:
#General
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output


# Time Series
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
import statsmodels.tsa.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Text Analytics
import sklearn.feature_extraction.text as sk_txt
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [None]:
# Load the reviews CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv('FACEBOOK_REVIEWS.csv')

# Data Pre-Processing

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Column dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# List the column labels.
df.columns

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index written out by an earlier
# to_csv — confirm). errors='ignore' keeps this cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

In [None]:
# Convert review_timestamp to pandas datetimes; later cells rely on the .dt accessor.
df['review_timestamp'] = pd.to_datetime(df['review_timestamp'])

In [None]:
# Earliest and latest review timestamps in the data.
df['review_timestamp'].min(), df['review_timestamp'].max()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Drop every row that has a missing value in any column, then confirm none remain.
df.dropna(inplace=True)
df.isna().sum()

In [None]:
# Re-check dtypes and row count after the missing rows were dropped.
df.info()

## Data Visualisation Part 1

In [None]:
# Summary statistics (pandas' default column selection).
df.describe()

#### Histplot with the count of review ratings 

In [None]:
sns.set_style("white")
sns.set_palette('deep')
# Tick positions: one per distinct rating value.
bins = sorted(df['review_rating'].unique())
# discrete=True draws unit-width bars centred on each value, so the bars line up
# with the ticks set below. A fixed bins=5 spreads the bin edges evenly over
# [min, max] and the bars drift away from the rating values.
# NOTE(review): assumes review_rating holds whole-number star ratings — confirm.
ax = sns.histplot(df['review_rating'], discrete=True)
plt.ticklabel_format(style='plain', axis='y')  # no scientific notation on the count axis
ax.set_xticks(bins)
plt.show()

#### Bar Chart with count of review ratings per year

In [None]:
df['Year'] = df['review_timestamp'].dt.year
# One row per (Year, review_rating) pair with the number of reviews in 'count'.
# size() with an explicit column name does not depend on how the installed
# pandas labels the output of value_counts() on a column that is also a group key.
year = df.groupby(['Year', 'review_rating']).size().reset_index(name='count')

In [None]:
# Bar chart of review counts per year, coloured by rating.
# The hover label shows the count with thousands separators and no decimals.
count_hover_format = {'count': ':,.0f'}
fig = px.bar(
    year,
    x='Year',
    y='count',
    color='review_rating',
    hover_data=count_hover_format,
)
fig.show()

#### Scatterplot and Trendline with count of review likes by review rating

In [None]:
# Total likes received by reviews of each rating, one row per rating.
likes = df.groupby('review_rating', as_index=False)['review_likes'].sum()

In [None]:
# Scatter of total likes per rating with an ordinary-least-squares trendline.
fig = px.scatter(
    likes,
    x='review_rating',
    y='review_likes',
    hover_data={
        # d3-format needs digits after the precision dot, so use '.0f'
        # (a bare '.f' is not a valid specifier).
        'review_rating': ':,.0f',
    },
    labels={
        'review_rating': 'Rating',
    },
    trendline='ols',
)
fig.show()

### Feature Creation for Time Series Analysis

In [None]:
# NOTE(review): `stop` is not defined in any cell shown here, so this line raises
# NameError. Presumably a deliberate guard so "Run All" halts before the
# time-series section — confirm.
print(stop)

In [None]:
# Keep only the 2022 and 2023 reviews for the time-series work.
df['year'] = df['review_timestamp'].dt.year
# .copy() makes df_2022_23 an independent frame, so the column assignments in
# the following cells modify it directly rather than a slice of df
# (which would trigger SettingWithCopyWarning).
df_2022_23 = df[df['year'].isin([2022, 2023])].copy()


In [None]:
# ISO calendar week number of each review.
iso_parts = df_2022_23['review_timestamp'].dt.isocalendar()
df_2022_23['week'] = iso_parts['week']


In [None]:
# Calendar day of each review, stored as a datetime column (time of day removed).
review_days = df_2022_23['review_timestamp'].dt.date
df_2022_23['date'] = pd.to_datetime(review_days)
df_2022_23['date']

In [None]:
# Number of distinct review ids per calendar day, one row per day.
ids_by_day = df_2022_23.groupby('date')['review_id']
unique_reviews_per_day = ids_by_day.nunique().reset_index(name='unique_reviews')
unique_reviews_per_day

In [None]:
# Plot the daily unique-review counts against the date.
unique_reviews_per_day.plot(x="date", y="unique_reviews")

In [None]:
# Augmented Dickey-Fuller stationarity test on the daily counts.
# The returned tuple starts with (test statistic, p-value, ...).
adfuller(unique_reviews_per_day['unique_reviews'])

In [None]:
# Autocorrelation of the daily counts up to lag 20, used to choose the MA order q.
plot_acf(unique_reviews_per_day["unique_reviews"].dropna(),lags=20)
# Author's reading of the plot: q — autocorrelations stay significant up to lag 10.

In [None]:
# Partial autocorrelation of the daily counts up to lag 10, used to choose the AR order p.
plot_pacf(unique_reviews_per_day["unique_reviews"].dropna(),lags=10)
# Author's reading of the plot: p = 1 — only the previous value has a significant partial correlation.

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Grid-search ARIMA(p, d, q) orders by AIC on the full daily series.
# p is fixed at 1; d and q are swept over the ranges below.
daily_counts = unique_reviews_per_day["unique_reviews"]
p = 1
aicVals = []  # rows of [aic, p, d, q] for every order that could be fitted
for d in range(5):
    for q in range(5, 10):
        print(p, d, q)
        try:
            aic = ARIMA(daily_counts, order=(p, d, q)).fit().aic
        except np.linalg.LinAlgError:
            # Some orders cannot be estimated numerically; report and continue.
            print(f"Unable to fit model for order=({p}, {d}, {q})")
        else:
            aicVals.append([aic, p, d, q])

# Report the candidate with the lowest AIC so the search result is actually used.
if aicVals:
    best_aic, best_p, best_d, best_q = min(aicVals, key=lambda row: row[0])
    print(f"Lowest AIC {best_aic:.2f} at order=({best_p}, {best_d}, {best_q})")

In [None]:
# Fit ARIMA(1, 0, 5) on the full daily series.
model = ARIMA(unique_reviews_per_day["unique_reviews"], order=(1, 0, 5)).fit()

In [None]:
# Coefficient table and fit statistics of the current model.
model.summary()

In [None]:
# Refit with a smaller MA order, ARIMA(1, 0, 2); this rebinds `model`.
model = ARIMA(unique_reviews_per_day["unique_reviews"], order=(1, 0, 2)).fit()

In [None]:
# Coefficient table and fit statistics of the current model.
model.summary()

In [None]:
pred_time = 1 # predict 1 time step into the future
result = model.predict(start=unique_reviews_per_day["unique_reviews"].shape[0]-10, end=unique_reviews_per_day["unique_reviews"].shape[0] + pred_time).reset_index(drop=True)

In [None]:
result

In [None]:
predictions = result.iloc[:-1]
predictions

In [None]:
actual_last10 = unique_reviews_per_day.tail(11).reset_index(drop=True)
actual_last10

In [None]:
# Calculate RMSE
# np.sqrt is used because no bare `sqrt` is imported in the visible cells,
# so calling it would raise NameError.
rmse = np.sqrt(mean_squared_error(actual_last10['unique_reviews'], predictions))
print('Test RMSE: %.3f' % rmse)

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination of the predictions against the observed counts.
r2 = r2_score(y_true=actual_last10['unique_reviews'], y_pred=predictions)
print(f'Test R^2: {r2:.3f}')

In [None]:
# Display the daily counts table again before splitting it.
unique_reviews_per_day

In [None]:
# Chronological split: the first 70% of the rows train the model, the rest is held out.
raw_counts = unique_reviews_per_day["unique_reviews"]
size = int(len(raw_counts) * 0.7)
train = raw_counts.iloc[:size]
test = raw_counts.iloc[size:]

In [None]:
# Sizes of the training and held-out parts.
train.shape, test.shape

In [None]:
# Fit ARIMA(1, 0, 2) on the training part only.
model = ARIMA(train, order=(1, 0, 2)).fit()

#### Fitting the model to the raw training data gives a bad result; data preparation is required to solve this.

In [None]:
# Make predictions
# One prediction per held-out row: positions len(train) .. len(train)+len(test)-1.
predict = model.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False)

# Calculate RMSE
# np.sqrt is used because no bare `sqrt` is imported in the visible cells,
# so calling it would raise NameError.
rmse = np.sqrt(mean_squared_error(test, predict))
print('Test RMSE: %.3f' % rmse)

from sklearn.metrics import r2_score
# Calculate R^2 score
r2 = r2_score(test, predict)
print('Test R^2: %.3f' % r2)

In [None]:
# Interquartile-range rule: a day is an outlier when its count lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
all_counts = unique_reviews_per_day['unique_reviews']
Q1 = all_counts.quantile(0.25)
Q3 = all_counts.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Create a new DataFrame excluding outliers
is_outlier = (all_counts < lower_fence) | (all_counts > upper_fence)
df_clean = unique_reviews_per_day[~is_outlier]
df_clean

In [None]:
# Plot the daily counts again with the outlier days removed.
df_clean.plot(x="date", y="unique_reviews")

In [None]:
# Augmented Dickey-Fuller test on the outlier-free counts.
adfuller(df_clean['unique_reviews'])
# Author's reading: p-value < 0.05, so the null hypothesis (the series is
# non-stationary) is rejected.

In [None]:
# Display the outlier-free table before splitting it.
df_clean

In [None]:
# Chronological 70/30 split of the outlier-free counts (positional, not by label).
clean_counts = df_clean["unique_reviews"]
train_size = int(len(clean_counts) * 0.7)
train = clean_counts.iloc[:train_size]
test = clean_counts.iloc[train_size:]

In [None]:
# Sizes of the training and held-out parts.
train.shape, test.shape

In [None]:
# Fit the model
# ARIMA(2, 1, 1) on the current training split, followed by its summary table.
model = ARIMA(train, order=(2, 1, 1)).fit()
model.summary()

In [None]:
# One prediction per held-out row: positions len(train) .. len(train)+len(test)-1.
predictions = model.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False)

mae = mean_absolute_error(test, predictions)
print('Test MAE: %.2f'% mae)

# np.sqrt is used because no bare `sqrt` is imported in the visible cells,
# so calling it would raise NameError.
rmse = np.sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Args:
        y_true: Observed values (any array-like).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the positions where
        ``y_true`` is non-zero. Positions with a zero actual are skipped
        because the percentage error is undefined there. Returns NaN when no
        non-zero actual is available.
    """
    y_true, y_pred = np.broadcast_arrays(
        np.asarray(y_true, dtype=float),
        np.asarray(y_pred, dtype=float),
    )
    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan
    relative_errors = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
    return np.mean(relative_errors) * 100

# MAPE of the predictions on the held-out part, printed with two decimals.
mape = mean_absolute_percentage_error(y_true=test, y_pred=predictions)
print(f'Test MAPE: {mape:.2f}')