# Data Loading

In [22]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read first dataset
df1 = pd.read_csv("dataset/deceptive-opinion.csv")
df1.head()

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [3]:
# Read second dataset
df2 = pd.read_csv("dataset/fake reviews dataset.csv")
df2.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [4]:
# Read third dataset
df3 = pd.read_csv("dataset/reviews_dataset.csv")
df3.head()

Unnamed: 0,Review_Text,Rating,Label
0,"The service was terrible, but the food was good.",3,1
1,"Very disappointing, I expected better.",1,1
2,Fantastic! Exceeded my expectations!,1,0
3,This product is amazing! I love it so much!,2,0
4,"The service was terrible, but the food was good.",2,1


In [5]:
# Change the label to ensure consistency
df1['Label'] = df1['deceptive'].map({'truthful': 0, 'deceptive': 1})
df1.head()

Unnamed: 0,deceptive,hotel,polarity,source,text,Label
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...,0
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...,0
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...,0
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...,0
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...,0


In [6]:
# Change the label to ensure consistency
df2['Label'] = df2['label'].map({'CG': 1, 'OR': 0})
df2.head()

Unnamed: 0,category,rating,label,text_,Label
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...",1
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...",1
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,1
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...",1
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,1


0 -> True, 1 -> Fake

In [7]:
df1_processed = df1[['text', 'Label']].rename(columns={'Label': 'label'})
df2_processed = df2[['text_', 'Label']].rename(columns={'text_': 'text', 'Label': 'label'})
df3_processed = df3[['Review_Text', 'Label']].rename(columns={'Review_Text': 'text', 'Label':'label'})

combined_df = pd.concat([df1_processed, df2_processed, df3_processed], ignore_index=True)
combined_df.head()

Unnamed: 0,text,label
0,We stayed for a one night getaway with family ...,0
1,Triple A rate with upgrade to view room was le...,0
2,This comes a little late as I'm finally catchi...,0
3,The Omni Chicago really delivers on all fronts...,0
4,I asked for a high floor away from the elevato...,0


In [8]:
combined_df.dtypes

text     object
label     int64
dtype: object

# Data Cleaning

In [9]:
combined_df.isna().sum()

text     0
label    0
dtype: int64

In [10]:
combined_df.duplicated().sum()

994

In [11]:
combined_df.drop_duplicates()

Unnamed: 0,text,label
0,We stayed for a one night getaway with family ...,0
1,Triple A rate with upgrade to view room was le...,0
2,This comes a little late as I'm finally catchi...,0
3,The Omni Chicago really delivers on all fronts...,0
4,I asked for a high floor away from the elevato...,0
...,...,...
42088,Awful! I will never return.,0
42103,Terrible experience. The product broke on day 1.,0
42107,Awful! I will never return.,1
42121,This is the worst product ever!,1


In [12]:
combined_df['label'].value_counts()

label
0    21569
1    21463
Name: count, dtype: int64

In [13]:
combined_df.describe()

Unnamed: 0,label
count,43032.0
mean,0.498768
std,0.500004
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


# Text Cleaning

In [14]:
def light_clean(text):
    return text.replace('\n', ' ').strip()  # Only basic cleanup

combined_df['clean_text'] = combined_df['text'].astype(str).apply(light_clean)

In [15]:
combined_df.drop(columns=['text'], inplace=True)
combined_df.rename(columns={'clean_text': 'text'}, inplace=True)
combined_df.to_csv("processed_data/cleaned_data.csv", header=True, index=False)

In [23]:
df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_df, temp_df = train_test_split(df, test_size=0.8, stratify=df['label'], random_state=42)
test_df, _ = train_test_split(temp_df, test_size=(len(temp_df) - 5000), stratify=temp_df['label'], random_state=42)

train_df = train_df.sample(n=5000, random_state=42) if len(train_df) > 5000 else train_df
test_df = test_df.sample(n=5000, random_state=42) if len(test_df) > 5000 else test_df

train_df.to_csv("processed_data/train.csv", index=False)
test_df.to_csv("processed_data/test.csv", index=False)
