# Collecting Data

## Top Publications

In [336]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import numpy as np
import itertools
import time

In [316]:
response = requests.get("https://toppubs.smedian.com/")
page = response.content
soup = BeautifulSoup(page, 'html.parser')

In [317]:
titles = soup.find_all("a", class_='heading link link--primary u-accentColor--hoverTextNormal u-displayInline')

In [318]:
publications = [i.text for i in titles][:10]

In [319]:
links = [i['href'] for i in titles][:10]

In [320]:
publications

['The Startup',
 'Mission.org',
 'Personal Growth',
 'HackerNoon.com',
 'Towards Data Science',
 'Startup Grind',
 'The Economist',
 'The Coinbase Blog',
 'Better Humans',
 'UX Collective']

In [321]:
links

['https://medium.com/swlh',
 'https://medium.com/the-mission',
 'https://medium.com/personal-growth',
 'https://medium.com/hackernoon',
 'https://medium.com/towards-data-science',
 'https://medium.com/startup-grind',
 'https://medium.com/the-economist',
 'https://medium.com/the-coinbase-blog',
 'https://medium.com/better-humans',
 'https://medium.com/user-experience-design-1']

In [322]:
urls = {i:j+'/archive/{0}/{1:02d}/{2:02d}' for i, j in zip(publications, links)}

In [323]:
urls

{'The Startup': 'https://medium.com/swlh/archive/{0}/{1:02d}/{2:02d}',
 'Mission.org': 'https://medium.com/the-mission/archive/{0}/{1:02d}/{2:02d}',
 'Personal Growth': 'https://medium.com/personal-growth/archive/{0}/{1:02d}/{2:02d}',
 'HackerNoon.com': 'https://medium.com/hackernoon/archive/{0}/{1:02d}/{2:02d}',
 'Towards Data Science': 'https://medium.com/towards-data-science/archive/{0}/{1:02d}/{2:02d}',
 'Startup Grind': 'https://medium.com/startup-grind/archive/{0}/{1:02d}/{2:02d}',
 'The Economist': 'https://medium.com/the-economist/archive/{0}/{1:02d}/{2:02d}',
 'The Coinbase Blog': 'https://medium.com/the-coinbase-blog/archive/{0}/{1:02d}/{2:02d}',
 'Better Humans': 'https://medium.com/better-humans/archive/{0}/{1:02d}/{2:02d}',
 'UX Collective': 'https://medium.com/user-experience-design-1/archive/{0}/{1:02d}/{2:02d}'}

Towards Data Science,The Economist,The Coinbase Blog, UX Collective have custom domian

In [324]:
urls['Towards Data Science'] = 'https://towardsdatascience.com/archive/{0}/{1:02d}/{2:02d}'
urls['The Economist'] = 'https://medium.economist.com/archive/{0}/{1:02d}/{2:02d}'
urls['The Coinbase Blog'] = 'https://blog.coinbase.com/archive/{0}/{1:02d}/{2:02d}'
urls['UX Collective'] = 'https://uxdesign.cc/archive/{0}/{1:02d}/{2:02d}'

In [326]:
urls

{'The Startup': 'https://medium.com/swlh/archive/{0}/{1:02d}/{2:02d}',
 'Mission.org': 'https://medium.com/the-mission/archive/{0}/{1:02d}/{2:02d}',
 'Personal Growth': 'https://medium.com/personal-growth/archive/{0}/{1:02d}/{2:02d}',
 'HackerNoon.com': 'https://medium.com/hackernoon/archive/{0}/{1:02d}/{2:02d}',
 'Towards Data Science': 'https://towardsdatascience.com/archive/{0}/{1:02d}/{2:02d}',
 'Startup Grind': 'https://medium.com/startup-grind/archive/{0}/{1:02d}/{2:02d}',
 'The Economist': 'https://medium.economist.com/archive/{0}/{1:02d}/{2:02d}',
 'The Coinbase Blog': 'https://blog.coinbase.com/archive/{0}/{1:02d}/{2:02d}',
 'Better Humans': 'https://medium.com/better-humans/archive/{0}/{1:02d}/{2:02d}',
 'UX Collective': 'https://uxdesign.cc/archive/{0}/{1:02d}/{2:02d}'}

## Data Extraction from medium

In [327]:
def randDates(start, end, n):
    start_u = start.value//10**9
    end_u = end.value//10**9

    return pd.DatetimeIndex((10**9*np.random.randint(start_u, end_u, n, dtype=np.int64)).view('M8[ns]')).date

In [328]:
start = pd.to_datetime('2020-01-01')
end = pd.to_datetime('2020-08-01')
dates = pd.to_datetime(pd.Series(randDates(start, end, n=50)))

In [329]:
len(dates)

50

### Dry Test

In [264]:
url = 'https://medium.com/swlh/archive/{0}/{1:02d}/{2:02d}'
year = 2020
month = dates.dt.month[0]
day = dates.dt.day[0]

In [266]:
month,day

(2, 14)

In [248]:
response = requests.get(url.format(year, month, day), allow_redirects=True)
page = response.content
soup = BeautifulSoup(page, 'html.parser')

In [249]:
year,month,day

(2020, 5, 21)

In [250]:
articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")

titles = [i.find('h3',class_="graf--title" ).text for i in articles]

sub_titles = [i.find("h4", class_="graf--subtitle").text if i.find("h4", class_="graf--subtitle") is not None else '' for i in articles]

article_link = [i.find_all('a')[3]['href'].split('?')[0] for i in articles]

claps = [0 if (k is None) or (k == '') or (k.split is None) else int(float(k.split('K')[0])*1000) if len(k.split('K'))==2 else int(float(k.split('K')[0])) for k in [j.text for j in [i.find_all('button')[1] for i in articles]]]

reading_time = [int(i.find("span", class_="readingTime")['title'].split()[0]) if i.find("span", class_="readingTime") is not None else 0 for i in articles]

responses = [i.find_all('a')[6].text.split(' ')[0] if (len(i.find_all('a'))==7) and len(i.find_all('a')[6].text.split(' '))!=0 else 0 for i in articles]

In [262]:
columns = ['Title', 'SubTitle', 'Link', 'Claps', 'Reading_Time', 'Responses']

In [261]:
pd.DataFrame(zip(titles, sub_titles, article_link, claps, reading_time, responses), columns=columns)

Unnamed: 0,Title,SubTitle,Link,Claps,Reading_Time,Responses
0,How AWS WAF saved us,Benefits of AWS WAF and how it helped us work ...,https://medium.com/swlh/how-aws-waf-saved-us-6...,455,8,4
1,Developing with Deno: Rest API,,https://medium.com/swlh/developing-with-deno-r...,170,5,1
2,Input-Output Metric Framework for Product Mana...,,https://medium.com/swlh/input-output-metric-fr...,217,6,1
3,"Retiring My Old Mac, I Look At The Surface Lin...",Maybe Microsoft is where it’s at now…,https://medium.com/swlh/retiring-my-old-mac-i-...,128,7,7
4,Is a Cannabis ETF The Best Play For 2020?,,https://medium.com/swlh/is-a-cannabis-etf-the-...,226,9,0
...,...,...,...,...,...,...
99,Playing with Virtual Time,,https://medium.com/swlh/playing-with-virtual-t...,50,4,0
100,Build a Culture Primed to Innovate,The Most Important Factor Might Not Be What Yo...,https://medium.com/swlh/build-a-culture-primed...,166,4,1
101,Migrating from Relay to Apollo in create-react...,"Hi, there! Here’s a stepwise guide to those wh...",https://medium.com/swlh/migrating-from-relay-t...,51,3,0
102,Unique Aspects of Terminating H-1B Employees,With the unfortunate reality of layoffs it is ...,https://medium.com/swlh/unique-aspects-of-term...,53,3,0


### Actual Extraction

#### P.S: Ignore the error here! Actually I did a mistake while parsing dates and had to stop inbetween which led to dates mismatch and data was partially collected. Now 38 Random dates data is available which makes 4066 articles data

In [350]:
year = 2020
titles = []
sub_titles = []
article_link = []
claps = []
reading_time = []
responses = []
pubs = []
for i in range(len(dates)):
    month = dates.dt.month[i]
    day = dates.dt.day[i]
    for publication, url in urls.items():
        response = requests.get(url.format(year, month, day), allow_redirects=True)
        if not response.url.startswith(url.format(year, month, day)):
            continue
        page = response.content
        soup = BeautifulSoup(page, 'html.parser')
        articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        
        number = len([i.find('h3',class_="graf--title" ).text if i.find('h3',class_="graf--title" ) is not None else '' for i in articles])
        
        titles.append([i.find('h3',class_="graf--title" ).text if i.find('h3',class_="graf--title" ) is not None else '' for i in articles])

        sub_titles.append([i.find("h4", class_="graf--subtitle").text if i.find("h4", class_="graf--subtitle") is not None else '' for i in articles])

        article_link.append([i.find_all('a')[3]['href'].split('?')[0] for i in articles])

        claps.append([0 if (k is None) or (k == '') or (k.split is None) else int(float(k.split('K')[0])*1000) if len(k.split('K'))==2 else int(float(k.split('K')[0])) for k in [j.text for j in [i.find_all('button')[1] for i in articles]]])

        reading_time.append([int(i.find("span", class_="readingTime")['title'].split()[0]) if i.find("span", class_="readingTime") is not None else 0 for i in articles])

        responses.append([i.find_all('a')[6].text.split(' ')[0] if (len(i.find_all('a'))==7) and len(i.find_all('a')[6].text.split(' '))!=0 else 0 for i in articles])
        
        pubs.append([publication]*number)
        
        time.sleep(1)

KeyError: 0

In [339]:
columns = ['Title', 'SubTitle', 'Link', 'Claps', 'Reading_Time', 'Responses', 'Publication']

In [340]:
titles = list(itertools.chain.from_iterable(titles))
sub_titles = list(itertools.chain.from_iterable(sub_titles))
article_link = list(itertools.chain.from_iterable(article_link))
claps = list(itertools.chain.from_iterable(claps))
reading_time = list(itertools.chain.from_iterable(reading_time))
responses = list(itertools.chain.from_iterable(responses))
pubs = list(itertools.chain.from_iterable(pubs))

In [359]:
pd.DataFrame(zip(titles, sub_titles, article_link, claps, reading_time, responses, pubs), columns=columns).to_csv('Part1.csv')

In [362]:
df = pd.read_csv('Part1.csv')

In [364]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4065 entries, 0 to 4064
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    4065 non-null   int64 
 1   Title         4059 non-null   object
 2   SubTitle      2604 non-null   object
 3   Link          4065 non-null   object
 4   Claps         4065 non-null   int64 
 5   Reading_Time  4065 non-null   int64 
 6   Responses     4065 non-null   int64 
 7   Publication   4065 non-null   object
dtypes: int64(4), object(4)
memory usage: 254.2+ KB
