#### 1. Import pandas library

In [17]:
import pandas as pd
import numpy as np

#### 2. Load data (users.csv, posts.csv)

In [2]:
users_df = pd.read_csv('users.csv')
posts_df = pd.read_csv('posts.csv')

In [3]:
users_df.head()

Unnamed: 0.1,Unnamed: 0,id,display_name,about_me,age,creation_date,last_access_date,location,reputation,up_votes,down_votes,views,profile_image_url,website_url
0,0,107658,ivordesign,,,2009-05-15 12:21:51.230000+00:00,2014-08-28 17:43:18.040000+00:00,United Kingdom,305,2,0,91,,http://www.ivorthedesigner.co.uk
1,1,218597,icabod,,,2009-11-25 13:30:42.460000+00:00,2020-05-29 11:44:18.140000+00:00,United Kingdom,6559,361,25,374,,http://icablog.org/
2,2,326360,rbaker86,<p>Software developer and technology enthusias...,,2010-04-26 21:18:19.857000+00:00,2017-12-04 16:49:29.297000+00:00,United Kingdom,1802,40,14,97,,http://na
3,3,379556,Rui Marques,<p>Engineer and Automation Architect at Vonage...,,2010-06-29 23:32:49.870000+00:00,2020-05-28 14:49:23.427000+00:00,United Kingdom,2335,125,10,261,,http://ruimarques.io
4,4,450456,Jules,,,2010-09-17 09:47:09.397000+00:00,2020-05-13 07:03:58.113000+00:00,United Kingdom,7746,256,50,3598,https://i.stack.imgur.com/4Y8Zf.jpg?s=128&g=1,https://www.julesmoorhouse.com


In [4]:
posts_df.head()

Unnamed: 0,id,owner_user_id,score,view_count,comment_count,favorite_count
0,30336926,,1,14,3,
1,36873524,,1,18,0,1.0
2,36605876,,1,19,0,
3,36718461,,1,17,0,
4,30434893,,0,15,0,


#### 3. Rename id column to user_id

In [5]:
users_df.rename(columns={'id':'user_id'}, inplace=True)

In [6]:
users_df.head(5)

Unnamed: 0.1,Unnamed: 0,user_id,display_name,about_me,age,creation_date,last_access_date,location,reputation,up_votes,down_votes,views,profile_image_url,website_url
0,0,107658,ivordesign,,,2009-05-15 12:21:51.230000+00:00,2014-08-28 17:43:18.040000+00:00,United Kingdom,305,2,0,91,,http://www.ivorthedesigner.co.uk
1,1,218597,icabod,,,2009-11-25 13:30:42.460000+00:00,2020-05-29 11:44:18.140000+00:00,United Kingdom,6559,361,25,374,,http://icablog.org/
2,2,326360,rbaker86,<p>Software developer and technology enthusias...,,2010-04-26 21:18:19.857000+00:00,2017-12-04 16:49:29.297000+00:00,United Kingdom,1802,40,14,97,,http://na
3,3,379556,Rui Marques,<p>Engineer and Automation Architect at Vonage...,,2010-06-29 23:32:49.870000+00:00,2020-05-28 14:49:23.427000+00:00,United Kingdom,2335,125,10,261,,http://ruimarques.io
4,4,450456,Jules,,,2010-09-17 09:47:09.397000+00:00,2020-05-13 07:03:58.113000+00:00,United Kingdom,7746,256,50,3598,https://i.stack.imgur.com/4Y8Zf.jpg?s=128&g=1,https://www.julesmoorhouse.com


#### 4. Rename id column to post_id and owner_user_id to user_id

In [7]:
posts_df.rename(columns={'id':'post_id', 'owner_user_id':'user_id'}, inplace=True)

In [8]:
len(posts_df['user_id'].unique())

223872

#### 5. Define new dataframes for users and posts with the following selected columns:
    **users columns**: user_id, reputation,views,up_votes,down_votes
    **posts columns**: post_id, score,user_id,view_count,comment_count

In [9]:
new_users_df = users_df.loc[:,['user_id','reputation','views','up_votes','down_votes']].copy()

In [10]:
new_posts_df = posts_df.loc[:,['post_id','score','user_id','view_count','comment_count']].copy()

#### 6. Merge both dataframes, users and posts. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [11]:
merged_df = new_posts_df.merge(new_users_df, how='outer', left_on='user_id', right_on='user_id').copy()

In [12]:
# with outer merge we get all the rows from both dataframes
# even if they don't match user_id with each other
merged_df[merged_df['user_id'].isna() != True]

Unnamed: 0,post_id,score,user_id,view_count,comment_count,reputation,views,up_votes,down_votes
1242,28532353.0,1.0,4453293.0,18.0,0.0,,,,
1243,37342127.0,0.0,1876983.0,9.0,0.0,,,,
1244,36829997.0,1.0,1876983.0,16.0,1.0,,,,
1245,34535507.0,0.0,1876983.0,15.0,0.0,,,,
1246,34961481.0,0.0,1876983.0,18.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...
339789,,,574332.0,,,41.0,6.0,10.0,0.0
339790,,,7155452.0,,,41.0,6.0,0.0,0.0
339791,,,5529001.0,,,41.0,11.0,0.0,0.0
339792,,,4105549.0,,,41.0,11.0,2.0,0.0


#### 7. How many missing values do you have in your merged dataframe? 

In [13]:
'''
By using outer merge we do not miss any rows from the original dataframes
'''

'\nBy using outer merge we do not miss any rows from the original dataframes\n'

In [14]:
# counting values NaN 
listcount = []
for i in merged_df.columns:
    listcount.append(merged_df[i].isna().sum())

In [15]:
print('Total number of NaN values (missing values) in the dataframe: ',sum(listcount))

Total number of NaN values (missing values) in the dataframe:  1356158


#### Bonus: Identify extreme values in your merged dataframe, create a dataframe called outliers with the same columns as our data set and calculate the bounds. The values of the outliers dataframe will be the values of the merged_df that fall outside that bounds. You will need to save your outliers dataframe to a csv file on your-code folder. Hint: post_id cannot have outliers!

In [33]:
# we will use 'Sigma clipping' to filter the oultiers
# first we need to have only matching rows, so we merge by inner this time
merged_df_2 = new_posts_df.merge(new_users_df, how='inner', left_on='user_id', right_on='user_id').copy()

In [75]:
# then we calcute the quartiles with numpy
outliers_quartiles = np.percentile(merged_df_2['up_votes'],[25,50,70])

In [76]:
outliers_quartiles

array([ 15.,  96., 321.])

In [77]:
# according to the statistical gaussian formula we create mu variable
mu = outliers_quartiles[1]

In [78]:
# and calculate the sigma
sig = 0.74 * (outliers_quartiles[2] - outliers_quartiles[0])

In [79]:
sig

226.44

In [80]:
merged_df_2

Unnamed: 0,post_id,score,user_id,view_count,comment_count,reputation,views,up_votes,down_votes
0,35445951,0,598427.0,16,0,580,100,13,1
1,33368053,0,4174195.0,13,8,3228,426,321,7
2,34204113,0,4174195.0,19,0,3228,426,321,7
3,37568247,0,4174195.0,15,2,3228,426,321,7
4,36286848,0,4174195.0,17,0,3228,426,321,7
...,...,...,...,...,...,...,...,...,...
1060,34719924,0,410636.0,16,0,42200,3858,1793,796
1061,36984968,0,6275471.0,14,0,27,12,0,0
1062,34813273,0,27754.0,16,0,1018,90,40,24
1063,35922834,0,401743.0,13,0,1024,83,9,2


In [82]:
# we filter using the following query.
# First we will filter the column up_votes 
merged_df_2 = merged_df_2.query('(up_votes > @mu - 5 * @sig) & (up_votes < @mu + 5 * @sig)')

In [83]:
# as we can see an aproximate hundred values have been filtered out from the dataframe
merged_df_2

Unnamed: 0,post_id,score,user_id,view_count,comment_count,reputation,views,up_votes,down_votes
0,35445951,0,598427.0,16,0,580,100,13,1
1,33368053,0,4174195.0,13,8,3228,426,321,7
2,34204113,0,4174195.0,19,0,3228,426,321,7
3,37568247,0,4174195.0,15,2,3228,426,321,7
4,36286848,0,4174195.0,17,0,3228,426,321,7
...,...,...,...,...,...,...,...,...,...
1059,16871531,0,1306351.0,19,2,1152,78,119,13
1061,36984968,0,6275471.0,14,0,27,12,0,0
1062,34813273,0,27754.0,16,0,1018,90,40,24
1063,35922834,0,401743.0,13,0,1024,83,9,2


In [85]:
# then we do the same on the down_votes column
outliers_quartiles = np.percentile(merged_df_2['down_votes'],[25,50,70])
mu = outliers_quartiles[1]
sig = 0.74 * (outliers_quartiles[2] - outliers_quartiles[0])
merged_df_2 = merged_df_2.query('(down_votes > @mu - 5 * @sig) & (down_votes < @mu + 5 * @sig)')

In [86]:
# another hundred or so are gone
merged_df_2

Unnamed: 0,post_id,score,user_id,view_count,comment_count,reputation,views,up_votes,down_votes
0,35445951,0,598427.0,16,0,580,100,13,1
1,33368053,0,4174195.0,13,8,3228,426,321,7
2,34204113,0,4174195.0,19,0,3228,426,321,7
3,37568247,0,4174195.0,15,2,3228,426,321,7
4,36286848,0,4174195.0,17,0,3228,426,321,7
...,...,...,...,...,...,...,...,...,...
1059,16871531,0,1306351.0,19,2,1152,78,119,13
1061,36984968,0,6275471.0,14,0,27,12,0,0
1062,34813273,0,27754.0,16,0,1018,90,40,24
1063,35922834,0,401743.0,13,0,1024,83,9,2


In [87]:
# if we look the top and bottom values for each we will see
# that now are more consistant and not so distant between each other
merged_df_2.nlargest(5,'up_votes')

Unnamed: 0,post_id,score,user_id,view_count,comment_count,reputation,views,up_votes,down_votes
1003,37482402,1,1700106.0,16,10,1877,216,1203,3
675,36648652,0,1125402.0,8,0,5903,381,1197,29
687,35563832,1,28901.0,19,0,16971,549,1188,29
703,33754832,1,199111.0,16,0,2800,121,1171,4
1034,30461043,0,637609.0,15,1,5206,567,1095,14


In [89]:
merged_df_2.nsmallest(5,'up_votes')

Unnamed: 0,post_id,score,user_id,view_count,comment_count,reputation,views,up_votes,down_votes
105,34927356,0,5191314.0,18,1,53,10,0,0
106,34451125,0,5191314.0,20,1,53,10,0,0
115,21867051,0,1939492.0,17,2,25,23,0,0
116,31038301,0,4295902.0,10,0,1,4,0,0
132,37577086,1,6357705.0,20,2,129,17,0,0


In [90]:
merged_df_2.nlargest(5,'down_votes')

Unnamed: 0,post_id,score,user_id,view_count,comment_count,reputation,views,up_votes,down_votes
456,37593146,0,1250422.0,18,0,1256,160,213,37
931,32612597,0,3433325.0,20,0,934,50,98,37
1050,28203036,1,17440.0,12,0,11127,702,144,36
1047,37721648,0,921224.0,7,0,4097,578,819,34
268,31974416,0,1127699.0,19,0,12155,1828,259,32


In [91]:
merged_df_2.nsmallest(5,'down_votes')

Unnamed: 0,post_id,score,user_id,view_count,comment_count,reputation,views,up_votes,down_votes
23,36854344,0,2748501.0,19,0,333,6,2,0
24,35073895,0,3813758.0,20,0,1115,46,15,0
25,35301617,0,3813758.0,17,0,1115,46,15,0
26,35813179,1,1134275.0,8,6,2038,213,48,0
52,36747702,0,6187855.0,18,0,92,10,2,0
