In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
# Each line of the file is expected to look like "__label__<digit> <review text>".
# It is free text rather than a delimited table, so it is read line by line
# instead of through the CSV parser, which would split a review on an embedded
# tab and treat a double quote at the start of a field as an opening quote.
filename = 'train.ft.txt'  # Make sure this file is downloaded and available in the working directory
with open(filename, encoding='utf-8') as source:
    # Blank lines carry no review and are skipped.
    raw_lines = pd.Series(
        [line.rstrip('\n') for line in source if line.strip()],
        dtype='object',
    )

# Extract labels and clean reviews
# The pattern is anchored at the start of the line, so only the leading marker is
# treated as the label; the same text appearing inside a review body is kept.
parsed = raw_lines.str.extract(r'^__label__(?P<label>\d) ?(?P<review>.*)$')

# Fail early with a readable message instead of an opaque int() conversion error.
malformed = parsed['label'].isna()
if malformed.any():
    raise ValueError(
        f"{int(malformed.sum())} line(s) in {filename!r} do not start with "
        f"'__label__<digit>'; first offending line: "
        f"{raw_lines[malformed].iloc[0][:80]!r}"
    )

data = pd.DataFrame({
    'review': parsed['review'],
    'label': parsed['label'].astype(int),  # Convert label to integer
})

# Map the labels to Positive/Negative
# A digit other than 1 or 2 has no entry in the mapping and becomes NaN.
data['label'] = data['label'].map({1: 'Negative', 2: 'Positive'})

In [None]:
# Exploratory Data Analysis (EDA)

# Plot 1: Histogram of Review Lengths

# Number of characters in each review
data['review_length'] = data['review'].map(len)

# Draw the length distribution in 50 bins with a KDE curve overlaid
length_fig, length_ax = plt.subplots(figsize=(10, 5))
sns.histplot(data['review_length'], bins=50, kde=True, ax=length_ax)
length_ax.set_title('Distribution of Review Lengths')
length_ax.set_xlabel('Review Length')
length_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Plot 2 - Bar Chart for Label Distribution

# One bar per sentiment label, with bar height equal to the number of reviews
label_fig, label_ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=data, x='label', ax=label_ax)
label_ax.set_title('Distribution of Positive and Negative Reviews')
label_ax.set_xlabel('Review Sentiment')
label_ax.set_ylabel('Count')
plt.show()

In [None]:
# Plot 3 - Heatmap of Correlation Between Review Length and Sentiment

# Encode labels as binary (Positive -> 1, Negative -> 0).
# An explicit mapping keeps missing or unrecognised labels as NaN, so they are
# left out of the correlation instead of being silently counted as Negative.
data['label_binary'] = data['label'].map({'Negative': 0, 'Positive': 1})

# Calculate correlation matrix
correlation_matrix = data[['review_length', 'label_binary']].corr()

# Plot heatmap of the correlation
# The colour scale is pinned to the full [-1, 1] correlation range. Without it
# the scale is fitted to the data, and because the diagonal is always 1.0 the
# off-diagonal cell would be drawn at the extreme cold colour however weak the
# correlation actually is.
plt.figure(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', cbar=True, vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Descriptive Statistics
# A notebook cell renders only its last expression automatically, so the first
# result is passed to display() explicitly; otherwise it would be computed and
# then discarded without ever being shown.

# Summary statistics for review lengths
display(data['review_length'].describe())

# Correlation matrix
data[['review_length', 'label_binary']].corr()