### Importing Reddit's API, PRAW to scrap subreddits, and Pandas

In [1]:
import praw
import pandas as pd

### Initialising the Reddit object with user's credentials

In [2]:
reddit = praw.Reddit(client_id = "*", 
                     client_secret = "*", 
                     user_agent = "*")

### Choosing the subreddit of r/india

In [3]:
subred = reddit.subreddit('india')

## Considering "hot", "top", "new" and "controversial" filters to scrap subreddits

In [4]:
hot = subred.hot(limit=3000)
top = subred.top(limit=3000)
controversial = subred.controversial(limit=3000)
new = subred.new(limit=3000)

In [5]:
type(hot)

praw.models.listing.generator.ListingGenerator

### Initilizing a dictionary "data" to store the scrapped data

In [6]:
data = {"id":[], "title":[], "url":[], "body":[], "flair":[]}

### Getting data from corresponding filtered subreddits and appending in "data"

In [7]:
for i in hot:
    
    data['id'].append(i.id)
    data['title'].append(i.title)
    data['url'].append(i.url)
    data['body'].append(i.selftext)    
    data['flair'].append(i.link_flair_text)    

In [8]:
for i in top:
    
    data['id'].append(i.id)
    data['title'].append(i.title)
    data['url'].append(i.url)
    data['body'].append(i.selftext)    
    data['flair'].append(i.link_flair_text)

In [9]:
for i in controversial:
    
    data['id'].append(i.id)
    data['title'].append(i.title)
    data['url'].append(i.url)
    data['body'].append(i.selftext)    
    data['flair'].append(i.link_flair_text)

In [10]:
for i in new:
    
    data['id'].append(i.id)
    data['title'].append(i.title)
    data['url'].append(i.url)
    data['body'].append(i.selftext)    
    data['flair'].append(i.link_flair_text)

### Converting the dictionary to a Pandas DataFrame

In [11]:
import pandas as pd
df = pd.DataFrame(data)

In [12]:
df.head()

Unnamed: 0,id,title,url,body,flair
0,g1zi21,Coronavirus (COVID-19) Megathread - News and U...,https://www.reddit.com/r/india/comments/g1zi21...,###[Covid-19 Fundraisers & Donation Links](htt...,Coronavirus
1,g4a2ux,Karnataka CM Yediyurappa defends Kumaraswamy s...,https://www.indiatoday.in/india/story/karnatak...,,Coronavirus
2,g4b8we,Pregnant woman in Bengaluru walks 5 km in sear...,https://www.thenewsminute.com/article/pregnant...,,Coronavirus
3,g4jmo6,"Nisha Jindal, with 10k FB fans, turns out to b...",https://timesofindia.indiatimes.com/india/nish...,,Non-Political
4,g4c5u3,Goa becomes coronavirus-free after last active...,https://www.timesnownews.com/india/article/goa...,,Coronavirus


In [13]:
df.shape

(3615, 5)

### Since subreddits can appear in multiple filters, the DF is sorted and duplicate elements are dropped.

In [14]:
df.sort_values("id", inplace = True)
df.drop_duplicates(subset ="id", inplace = True)

In [15]:
df.shape

(2845, 5)

### Analyzing the unique flairs and their frequencies from the collected data

In [19]:
df['flair'].value_counts()

Politics                          921
Non-Political                     771
Coronavirus                       417
AskIndia                          188
[R]eddiquette                      71
Policy/Economy                     59
Photography                        56
Sports                             39
Business/Finance                   38
Science/Technology                 34
Food                               22
Unverified                         19
Scheduled                          11
CAA-NRC                            11
Moderated                           7
Misleading                          6
CAA-NRC-NPR                         5
Policy                              4
Demonetization                      4
r/all                               3
Policy & Economy                    3
Entertainment                       3
/r/all                              2
AMA                                 2
AMA Concluded                       1
Goal Achieved!!!                    1
Lifehacks   

### Choosing the top 10 flairs from the scrapped list as the categories for the succeeding prediction task. 

In [22]:
top_flairs = ["Politics", "Non-Political", "Coronavirus", "AskIndia", "Policy/Economy", "[R]eddiquette", 
              "Photography", "Business/Finance", "Sports", "Science/Technology"]

df_top = df.loc[df['flair'].isin(top_flairs)]

In [23]:
df_top.shape

(2594, 5)

In [24]:
df_top.head()

Unnamed: 0,id,title,url,body,flair
2335,1vfqzr,Hrithik Roshan to tie the knot for the second ...,http://www.bollywoodmantra.com/news/hrithik-ro...,,Non-Political
2152,1vgb3k,"Girls of /r/India, would any of you be interes...",https://www.reddit.com/r/india/comments/1vgb3k...,.,Non-Political
2303,1vqu7l,Regarding the ongoing Kejru dramabaaz,https://www.reddit.com/r/india/comments/1vqu7l...,"I spoke with some people in Delhi , one of who...",Politics
2266,1w4xsp,A site that lists out scams by the BJP,http://bjpscams.com/,,Politics
2124,1wcra7,"Modi says India has no War Memorials, here's a...",http://www.truthofgujarat.com/modi-says-india-...,,Politics


### Scrapping again from the r/india subreddit for the above mentioned flairs with a limit of 100 for each flairs

In [25]:
reddit = praw.Reddit(client_id = "*", 
                     client_secret = "*", 
                     user_agent = "*")

subred = reddit.subreddit('india')
sample_data = {"id":[], "title":[], "url":[], "body":[], "flair":[]}

top_flairs = ["Politics", "Non-Political", "Coronavirus", "AskIndia", "Policy/Economy", "[R]eddiquette", 
              "Photography", "Business/Finance", "Sports", "Science/Technology"]

for flair in top_flairs:
  
  top_f = subred.search(flair, limit=100)
  
  for i in top_f:
    
    sample_data["id"].append(i.id)
    sample_data["title"].append(i.title)
    sample_data["url"].append(i.url)
    sample_data["body"].append(i.selftext)
    sample_data["flair"].append(flair)
    
sample = pd.DataFrame(sample_data)


In [26]:
sample.head()

Unnamed: 0,id,title,url,body,flair
0,g2ct57,A polite request to all Indians here,https://www.reddit.com/r/india/comments/g2ct57...,I don't know if it is the same situation in ot...,Politics
1,futac9,Pitting a community against a political party ...,https://www.reddit.com/r/india/comments/futac9...,First of all let me start by saying it was stu...,Politics
2,ff8sth,A new political party gave a full front page a...,https://i.redd.it/yjo9wpy38el41.jpg,,Politics
3,fpaj1w,Hit by backlash over posts on lack of medical ...,https://theprint.in/india/hit-by-backlash-over...,,Politics
4,fxs1vy,Politics in the time of corona: WB CM question...,https://www.timesnownews.com/india/article/pol...,,Politics


In [27]:
count = sample.groupby('flair')['id'].nunique()

print (count)

flair
AskIndia              100
Business/Finance      100
Coronavirus           100
Non-Political         100
Photography           100
Policy/Economy        100
Politics              100
Science/Technology    100
Sports                100
[R]eddiquette          18
Name: id, dtype: int64


### It is seen that "[R]eddiquette"  flair has less samples so, using the previously scrapped data and concatinating for    "[R]eddiquette" alone.

In [20]:
df_r = df[df['flair'] == '[R]eddiquette']
df_r.head()

Unnamed: 0,id,title,url,body,flair
1843,1xseo5,TIL. Lord Shiva encourages [r]itual rape prost...,https://www.reddit.com/r/india/comments/1xseo5...,"Wendy Doniger, has tried to bring forth many r...",[R]eddiquette
2295,1zf5ln,Modi - No [R]iots in Gujarat in last 10 years ...,https://www.reddit.com/r/india/comments/1zf5ln...,1) [Vadodara: Centre sends para-military help]...,[R]eddiquette
1789,217pd5,New Mod Announcement [R],https://www.reddit.com/r/india/comments/217pd5...,We are happy to announce the addition of /u/Aw...,[R]eddiquette
2525,254794,A 12-year-old happy child showing the flowe[r]...,https://pbs.twimg.com/media/BnIaNqtCUAAU8l7.jpg,,[R]eddiquette
1955,2c0sze,[R] Where is the Muslim outrage?,https://www.reddit.com/r/india/comments/2c0sze...,"For the past few weeks, I have been observing ...",[R]eddiquette


In [21]:
df_r.shape

(71, 5)

In [28]:
final = pd.concat([sample, df_r])

In [29]:
final.head()

Unnamed: 0,id,title,url,body,flair
0,g2ct57,A polite request to all Indians here,https://www.reddit.com/r/india/comments/g2ct57...,I don't know if it is the same situation in ot...,Politics
1,futac9,Pitting a community against a political party ...,https://www.reddit.com/r/india/comments/futac9...,First of all let me start by saying it was stu...,Politics
2,ff8sth,A new political party gave a full front page a...,https://i.redd.it/yjo9wpy38el41.jpg,,Politics
3,fpaj1w,Hit by backlash over posts on lack of medical ...,https://theprint.in/india/hit-by-backlash-over...,,Politics
4,fxs1vy,Politics in the time of corona: WB CM question...,https://www.timesnownews.com/india/article/pol...,,Politics


In [30]:
count = final.groupby('flair')['id'].nunique()

print (count)

flair
AskIndia              100
Business/Finance      100
Coronavirus           100
Non-Political         100
Photography           100
Policy/Economy        100
Politics              100
Science/Technology    100
Sports                100
[R]eddiquette          89
Name: id, dtype: int64



### Converting the DataFrame into a CSV file

In [33]:
final.to_csv('Data/reddit-top-flairs.csv', index=False)