In [5]:
# Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set a style for all our plots
sns.set_style("whitegrid")

# Load the competition files.
# The relative paths assume the CSVs were unzipped next to this notebook;
# adjust them if your data lives somewhere else.
try:
    df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission = pd.read_csv("sample_submission.csv")
except FileNotFoundError as err:
    # Stop here instead of printing and carrying on: every later cell needs
    # `df`, so swallowing the error would only postpone the failure to a
    # confusing NameError further down the notebook.
    raise FileNotFoundError(
        f"Could not find {err.filename!r}. "
        "Make sure your CSV files are in the same directory as your notebook or provide the correct path."
    ) from err

In [6]:
# Preview the first five rows to get a feel for the columns and their content
df.head(n=5)

Unnamed: 0,row_id,body,rule,subreddit,positive_example_1,positive_example_2,negative_example_1,negative_example_2,rule_violation
0,0,Banks don't want you to know this! Click here ...,"No Advertising: Spam, referral links, unsolici...",Futurology,If you could tell your younger self something ...,hunt for lady for jack off in neighbourhood ht...,Watch Golden Globe Awards 2017 Live Online in ...,"DOUBLE CEE x BANDS EPPS - ""BIRDS""\n\nDOWNLOAD/...",0
1,1,SD Stream [ ENG Link 1] (http://www.sportsstre...,"No Advertising: Spam, referral links, unsolici...",soccerstreams,[I wanna kiss you all over! Stunning!](http://...,LOLGA.COM is One of the First Professional Onl...,#Rapper \n🚨Straight Outta Cross Keys SC 🚨YouTu...,[15 Amazing Hidden Features Of Google Search Y...,0
2,2,Lol. Try appealing the ban and say you won't d...,No legal advice: Do not offer or request legal...,pcmasterrace,Don't break up with him or call the cops. If ...,It'll be dismissed: https://en.wikipedia.org/w...,Where is there a site that still works where y...,Because this statement of his is true. It isn'...,1
3,3,she will come your home open her legs with an...,"No Advertising: Spam, referral links, unsolici...",sex,Selling Tyrande codes for 3€ to paypal. PM. \n...,tight pussy watch for your cock get her at thi...,NSFW(obviously) http://spankbang.com/iy3u/vide...,Good News ::Download WhatsApp 2.16.230 APK for...,1
4,4,code free tyrande --->>> [Imgur](http://i.imgu...,"No Advertising: Spam, referral links, unsolici...",hearthstone,wow!! amazing reminds me of the old days.Well...,seek for lady for sex in around http://p77.pl/...,must be watch movie https://sites.google.com/s...,We're streaming Pokemon Veitnamese Crystal RIG...,1


In [7]:
# Print a concise summary of the dataframe: column names, non-null counts, dtypes.
# Use it to spot missing values and unexpected dtypes before any modelling.
# In the output below every column reports 2029 non-null entries (no missing
# values), and the target column is named `rule_violation`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2029 entries, 0 to 2028
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   row_id              2029 non-null   int64 
 1   body                2029 non-null   object
 2   rule                2029 non-null   object
 3   subreddit           2029 non-null   object
 4   positive_example_1  2029 non-null   object
 5   positive_example_2  2029 non-null   object
 6   negative_example_1  2029 non-null   object
 7   negative_example_2  2029 non-null   object
 8   rule_violation      2029 non-null   int64 
dtypes: int64(2), object(7)
memory usage: 142.8+ KB


In [8]:
# Report the size of the training set as (rows, columns)
train_shape = df.shape
print(f"Training data shape: {train_shape}")

Training data shape: (2029, 9)


In [None]:
# Check the distribution of the target variable.
# The target column in this dataset is `rule_violation` (there is no `label`
# column, see the df.info() output). sort_index() lists the classes in 0, 1
# order instead of by frequency.
label_counts = df['rule_violation'].value_counts().sort_index()
print(label_counts)

# Visualize the distribution.
# `order=[0, 1]` pins the bar positions so the x-axis always reads
# "0 = No Violation, 1 = Violation" regardless of which class is more frequent.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=label_counts.index, y=label_counts.values, order=[0, 1], ax=ax)
ax.set_title('Distribution of Rule-Breaking vs. Non-Rule-Breaking Comments')
ax.set_ylabel('Number of Comments')
ax.set_xlabel('Label (1 = Violation, 0 = No Violation)')
plt.show()

In [None]:
# Display a few examples of rule-breaking comments.
# The comment text lives in `body` and the target in `rule_violation`.
print("--- Examples of Rule-Breaking Comments ---")
for text in df.loc[df['rule_violation'] == 1, 'body'].head():
    print(f"- {text}\n")

# Display a few examples of comments that are fine
print("--- Examples of Non-Rule-Breaking Comments ---")
for text in df.loc[df['rule_violation'] == 0, 'body'].head():
    print(f"- {text}\n")

# 6. Start Forming Hypotheses with Simple Feature Engineering
#
# Your intuition is your best tool here. Let's test a simple idea:
# "Are rule-breaking comments longer or shorter?"

# Create a new feature for the length of the comment (in characters)
df['comment_length'] = df['body'].str.len()

# Compare the statistics of comment length for each class
df.groupby('rule_violation')['comment_length'].describe()

In [None]:
# Visualize this comparison with a histogram.
# Color by the real target column, `rule_violation` (the dataframe has no
# `label` column). `comment_length` must already have been added to `df`.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='comment_length', hue='rule_violation', bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Comment Length by Label')
ax.set_xlabel('Comment Length (Number of Characters)')
plt.show()