# Checking quality of the dataset

In [None]:
import json
import pandas as pd

# Load dataset from JSONL file
data = []
with open("data.jsonl", "r") as file:
    for line in file:
        data.append(json.loads(line))  # Load each JSON object separately

# Convert to DataFrame
df = pd.DataFrame([item["data"] for item in data])

# Display general information
print(df.info())

# Show the first few rows
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8799 entries, 0 to 8798
Columns: 114 entries, approved_at_utc to link_flair_template_id
dtypes: bool(28), float64(7), int64(8), object(71)
memory usage: 6.0+ MB
None
  approved_at_utc  subreddit  \
0            None  Anarchism   
1            None  Anarchism   
2            None  Anarchism   
3            None  Anarchism   
4            None  Anarchism   

                                            selftext author_fullname  saved  \
0   What you are reading, watching, or listening ...        t2_6l4z3  False   
1                                                        t2_4vt6saq5  False   
2  I am an anarcho-nihilist and i am reading simi...   t2_1h2y206upy  False   
3  Ayo, im an anarchist but perhaps not the most ...     t2_83t23peb  False   
4  Looking for videos/podcasts/writing where folk...        t2_9haan  False   

  mod_reason_title  gilded  clicked  \
0             None       0    False   
1             None       0    False   

# Checks the percentage of the missing values

In [1]:
df.info()

NameError: name 'df' is not defined

In [None]:
print(df.isnull().sum().sort_values(ascending=False).head(20))


approved_at_utc        8799
mod_reason_by          8799
content_categories     8799
mod_note               8799
banned_by              8799
likes                  8799
banned_at_utc          8799
view_count             8799
removed_by             8799
num_reports            8799
approved_by            8799
top_awarded_type       8799
category               8799
report_reasons         8799
discussion_type        8799
mod_reason_title       8799
removed_by_category    8798
removal_reason         8798
distinguished          8776
author_cakeday         8774
dtype: int64


# Drops those with 100% missing values

In [None]:
df = df.drop(columns=[
    'approved_at_utc', 'mod_reason_by', 'content_categories', 'mod_note', 'banned_by',
    'likes', 'banned_at_utc', 'view_count', 'removed_by', 'num_reports', 'approved_by',
    'top_awarded_type', 'category', 'report_reasons', 'discussion_type', 'mod_reason_title'
])


# Dropping those with 99% of missing values

In [None]:
df = df.drop(columns=['removed_by_category', 'removal_reason', 'distinguished', 'author_cakeday'])


# Dropping other

In [None]:
df = df.drop(columns=[
    'gallery_data', 'is_gallery', 'media_metadata', 'crosspost_parent', 'crosspost_parent_list',
    'secure_media', 'media', 'author_flair_css_class', 'author_flair_background_color',
    'suggested_sort', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color',
    'selftext_html', 'link_flair_css_class', 'link_flair_template_id'
])


# Only keeping what is required

In [None]:
columns_to_keep = [
    "link_flair_text", "thumbnail_width", "thumbnail_height",
    "post_hint", "url_overridden_by_dest", 
    "over_18",
    "author_premium", "author_fullname"
]

# Drop all other columns
df = df[columns_to_keep]

In [None]:
print(df.isnull().sum().sort_values(ascending=False).head(20))


link_flair_text           2936
thumbnail_width           2632
thumbnail_height          2632
post_hint                 2560
url_overridden_by_dest    2059
author_premium             120
author_fullname            120
over_18                      0
dtype: int64


In [None]:
print(df.isnull().sum().sort_values(ascending=False).head(20))


link_flair_text           2936
thumbnail_width           2632
thumbnail_height          2632
post_hint                 2560
url_overridden_by_dest    2059
author_premium             120
author_fullname            120
over_18                      0
dtype: int64


# Preprocessing

- Filled **Unknown** for `link_flair_text`.  
- Filled with **Median values** in `thumbnail_width`, `thumbnail_height`.  
- Filled post_hint: If `media_only == True`, it's likely **"image"**, **"video"**, or **"gallery"** content.  
If `url_overridden_by_dest` is **NOT null**, it's likely a **"link"** post.  
Otherwise, assume **"text"** or **"selfpost"**.  
- `url_overridden_by_dest` is **not changed** because not having a URL just means that it doesn't refer to some other link.  
- `author_full_name` && `author_premium` All those rows with null values are dropped(since they are very less).


In [None]:
df['link_flair_text'].fillna('Unknown', inplace=True)
df['thumbnail_width'].fillna(df['thumbnail_width'].median(), inplace=True)
df['thumbnail_height'].fillna(df['thumbnail_height'].median(), inplace=True)
df['post_hint'] = df.apply(lambda row: 
    'image' if row['media_only'] == True else 
    'link' if pd.notna(row['url_overridden_by_dest']) else 
    'unknown', axis=1)
df = df.dropna(subset=['author_premium', 'author_fullname','media_only'])




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['link_flair_text'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['thumbnail_width'].fillna(df['thumbnail_width'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate ob

KeyError: 'media_only'

In [None]:
df = df.applymap(lambda x: str(x) if isinstance(x, list) else x)
df = df.drop_duplicates().reset_index(drop=True)


  df = df.applymap(lambda x: str(x) if isinstance(x, list) else x)


In [None]:
print(df.isnull().sum().sort_values(ascending=False).head(20))
df.info()

url_overridden_by_dest    1522
link_flair_text              0
thumbnail_width              0
thumbnail_height             0
post_hint                    0
over_18                      0
author_premium               0
author_fullname              0
media_only                   0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8205 entries, 0 to 8204
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   link_flair_text         8205 non-null   object 
 1   thumbnail_width         8205 non-null   float64
 2   thumbnail_height        8205 non-null   float64
 3   post_hint               8205 non-null   object 
 4   url_overridden_by_dest  6683 non-null   object 
 5   over_18                 8205 non-null   bool   
 6   author_premium          8205 non-null   bool   
 7   author_fullname         8205 non-null   object 
 8   media_only              8205 non-null   bool   
dtypes: bool(3), fl

In [None]:
df.to_csv("cleaned_reddit_data.csv", index=False)


# Feature description
- post_hint: Shows whether the post contains media(link) or not(unknown)

In [3]:
import pandas as pd

# Load Data
df = pd.read_csv("cleaned_reddit_data.csv")

# Get unique values of 'post_hint'
unique_post_hints = df["link_flair_text"].dropna().unique()

# Print unique values
print(len(unique_post_hints))
print(unique_post_hints)


136
['Unknown' 'New User' 'PDF' 'Tonight Jan 31st 6:00 PM EST' 'Meta'
 'Privacy: Instagram' 'Genocide' 'Lol' 'Flaired Users Only'
 'Satire - Flaired Users Only' 'Open Discussion'
 'Misleading Title - Flaired Users Only'
 ':snoo_feelsgoodman: Discussion :snoo_thoughtful:' ':snoo: Article'
 ':snoo_putback: Opinion' 'Low Karma' 'Covered by another article'
 '❌ Multiple user reports' 'US Politics' 'Political History'
 'US Elections' 'Legal/Courts' 'International Politics' 'Political Theory'
 'European Politics' 'Legislation' 'Non-US Politics' 'Discussion' 'News'
 'Breaking News' 'Satire ' 'Fake News' 'Satire' ':snoo: Article '
 '🌐 World News' ':snoo_joy: Meme' '📷 Pic' '🗳️ Beat Trump'
 ':snoo_dealwithit: Join r/democrats' '📺 Video'
 ':snoo_feelsgoodman: Healthcare' 'FOOD DESERTS' '📸 Album'
 ':pride: LGBTQ+' 'No Paywall' '✅ Accomplishment'
 ':snoo_thoughtful: Suggestion' '🌐 Foreign Policy' ':snoo_smile: Satire'
 '📉 Economy' 'Verified' 'do not brigade other subs' 'NO PAYWALL'
 ':snoo_smile: H

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8205 entries, 0 to 8204
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   link_flair_text         8205 non-null   object 
 1   thumbnail_width         8205 non-null   float64
 2   thumbnail_height        8205 non-null   float64
 3   post_hint               8205 non-null   object 
 4   url_overridden_by_dest  6683 non-null   object 
 5   over_18                 8205 non-null   bool   
 6   author_premium          8205 non-null   bool   
 7   author_fullname         8205 non-null   object 
 8   media_only              8205 non-null   bool   
dtypes: bool(3), float64(2), object(4)
memory usage: 408.8+ KB


In [11]:
import pandas as pd

# Load Data
df = pd.read_csv("cleaned_reddit_data.csv")

# Filter rows where both thumbnail_height and thumbnail_width are not null
filtered_df = df.dropna(subset=["thumbnail_height", "thumbnail_width"])

# Get unique values of 'link_flair_text' in the filtered data
unique_post_hints = filtered_df["link_flair_text"].dropna().unique()

# Print unique values
print(len(unique_post_hints))
print(unique_post_hints)


136
['Unknown' 'New User' 'PDF' 'Tonight Jan 31st 6:00 PM EST' 'Meta'
 'Privacy: Instagram' 'Genocide' 'Lol' 'Flaired Users Only'
 'Satire - Flaired Users Only' 'Open Discussion'
 'Misleading Title - Flaired Users Only'
 ':snoo_feelsgoodman: Discussion :snoo_thoughtful:' ':snoo: Article'
 ':snoo_putback: Opinion' 'Low Karma' 'Covered by another article'
 '❌ Multiple user reports' 'US Politics' 'Political History'
 'US Elections' 'Legal/Courts' 'International Politics' 'Political Theory'
 'European Politics' 'Legislation' 'Non-US Politics' 'Discussion' 'News'
 'Breaking News' 'Satire ' 'Fake News' 'Satire' ':snoo: Article '
 '🌐 World News' ':snoo_joy: Meme' '📷 Pic' '🗳️ Beat Trump'
 ':snoo_dealwithit: Join r/democrats' '📺 Video'
 ':snoo_feelsgoodman: Healthcare' 'FOOD DESERTS' '📸 Album'
 ':pride: LGBTQ+' 'No Paywall' '✅ Accomplishment'
 ':snoo_thoughtful: Suggestion' '🌐 Foreign Policy' ':snoo_smile: Satire'
 '📉 Economy' 'Verified' 'do not brigade other subs' 'NO PAYWALL'
 ':snoo_smile: H