#### 1. Import pandas library

In [1]:
import pandas as pd

#### 2. Load data (users.csv, posts.csv)

In [2]:
# Read the raw users table from the CSV file next to the notebook
users = pd.read_csv(filepath_or_buffer='users.csv')

In [3]:
# Read the raw posts table from the CSV file next to the notebook
posts = pd.read_csv(filepath_or_buffer='posts.csv')

#### 3. Rename id column to user_id

In [4]:
# Rename the key column to user_id. The renamed frame is reassigned instead of
# mutating with inplace=True, which pandas discourages and which prevents chaining.
users = users.rename(columns={'id': 'user_id'})

#### 4. Rename id column to post_id and owner_user_id to user_id

In [5]:
# Rename the post key to post_id and the owner column to user_id. The renamed
# frame is reassigned instead of mutating with inplace=True, which pandas
# discourages and which prevents chaining.
posts = posts.rename(columns={'id': 'post_id', 'owner_user_id': 'user_id'})

#### 5. Define new dataframes for users and posts with the following selected columns:
- **users columns**: user_id, reputation, views, up_votes, down_votes
- **posts columns**: post_id, score, user_id, view_count, comment_count

In [6]:
# Keep only the user columns needed for the analysis, then preview the first rows
users = users.loc[
    :,
    ['user_id', 'reputation', 'views', 'up_votes', 'down_votes'],
]
users.head(5)

Unnamed: 0,user_id,reputation,views,up_votes,down_votes
0,107658,305,91,2,0
1,218597,6559,374,361,25
2,326360,1802,97,40,14
3,379556,2335,261,125,10
4,450456,7746,3598,256,50


In [7]:
# Keep only the post columns needed for the analysis, then preview the first rows
posts = posts.loc[
    :,
    ['post_id', 'score', 'user_id', 'view_count', 'comment_count'],
]
posts.head(5)

Unnamed: 0,post_id,score,user_id,view_count,comment_count
0,30336926,1,,14,3
1,36873524,1,,18,0
2,36605876,1,,19,0
3,36718461,1,,17,0
4,30434893,0,,15,0


#### 6. Merge both dataframes, users and posts. 
You will need to perform a [merge](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html) of the posts and users dataframes.

In [8]:
# Inner join: keep only the rows whose user_id is present in both frames
merged_df = users.merge(posts, how='inner', on='user_id')

In [9]:
# Show the merged frame to inspect the combined user and post columns
merged_df

Unnamed: 0,user_id,reputation,views,up_votes,down_votes,post_id,score,view_count,comment_count
0,450456,7746,3598,256,50,34989087,0,19,0
1,450456,7746,3598,256,50,4142174,0,15,0
2,1517244,4172,224,1362,32,37211628,2,18,0
3,1870509,897,105,86,2,30260248,1,11,4
4,101719,4055,332,174,16,34528617,0,18,0
...,...,...,...,...,...,...,...,...,...
1060,4295902,1,4,0,0,31038301,0,10,0
1061,2856201,11,21,0,0,19348144,0,19,2
1062,5882818,11,3,0,0,35199880,0,20,2
1063,6241297,13,2,0,0,36798683,0,12,0


#### 7. How many missing values do you have in your merged dataframe? 

In [10]:
# Number of missing values in each column of the merged frame
merged_df.isnull().sum()

user_id          0
reputation       0
views            0
up_votes         0
down_votes       0
post_id          0
score            0
view_count       0
comment_count    0
dtype: int64

In [11]:
# There are no missing values

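There are no missing values in the merged dataframe. The posts preview above shows rows with a missing user_id, but the default inner merge keeps only the rows whose user_id matches a user, so those posts are not part of the merged dataframe.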

#### Bonus: Identify extreme values in your merged dataframe. Create a dataframe called outliers with the same columns as the data set and calculate the bounds. The values of the outliers dataframe will be the values of merged_df that fall outside those bounds. You will need to save your outliers dataframe to a csv file in the your-code folder. Hint: post_id cannot have outliers!

In [13]:
# Outliers are the values that satisfy either:
# value < Q1 - 1.5*IQR
# value > Q3 + 1.5*IQR

# IQR = (median of the upper half of the data) - (median of the lower half) = Q3 - Q1
# Q1 = 25th percentile
# Q3 = 75th percentile

In [16]:
# Summary statistics with one row per column of merged_df, plus the
# interquartile range (75th percentile minus 25th percentile) as a last column
stats = (
    merged_df
    .describe()
    .T
    .assign(IQR=lambda summary: summary['75%'] - summary['25%'])
)

display(stats)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR
user_id,1065.0,2280169.0,1833959.0,274.0,653292.0,1996838.0,3545704.0,6449207.0,2892412.0
reputation,1065.0,7251.177,24021.61,1.0,246.0,1145.0,5206.0,287170.0,4960.0
views,1065.0,615.2451,1497.408,0.0,39.0,145.0,496.0,23103.0,457.0
up_votes,1065.0,397.6883,747.6289,0.0,15.0,96.0,375.0,7886.0,360.0
down_votes,1065.0,49.41502,177.3088,0.0,0.0,3.0,20.0,2692.0,20.0
post_id,1065.0,33988460.0,4495709.0,4142174.0,32891107.0,35563172.0,36991120.0,37769864.0,4100013.0
score,1065.0,0.1239437,0.5781415,-7.0,0.0,0.0,0.0,3.0,0.0
view_count,1065.0,15.60657,3.711159,3.0,13.0,17.0,19.0,20.0,6.0
comment_count,1065.0,0.9267606,1.627503,0.0,0.0,0.0,1.0,11.0,1.0


In [18]:
# Start from an empty frame that has exactly the columns of the merged dataframe
outliers = pd.DataFrame(data=None, columns=merged_df.columns)

In [26]:
# Collect the IQR outliers of every measured column into `outliers`.
# A value is an outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.

# Identifier columns are labels, not measurements, so they cannot have outliers
id_columns = {'user_id', 'post_id'}

# One frame of outlier rows per column. They are concatenated once after the loop
# because DataFrame.append was removed in pandas 2.0 and growing a frame inside a
# loop copies it on every iteration.
outlier_frames = []

for col in stats.index:
    if col in id_columns:
        continue

    # I use at instead of loc because I only need to get a single value from the DataFrame
    iqr = stats.at[col, 'IQR']

    cutoff = iqr * 1.5

    # Bounds:
    lower = stats.at[col, '25%'] - cutoff
    upper = stats.at[col, '75%'] + cutoff

    results = merged_df[(merged_df[col] < lower) | (merged_df[col] > upper)].copy()

    # The outlier column indicates from what column the outlier is coming from
    results['outlier'] = col

    outlier_frames.append(results)

# Rebuild `outliers` from scratch so that re-running this cell does not duplicate rows.
# The original row labels of merged_df are kept so each outlier can be traced back.
if outlier_frames:
    outliers = pd.concat(outlier_frames)
else:
    outliers = pd.DataFrame(columns=[*merged_df.columns, 'outlier'])

In [27]:
# Write the outliers (row labels included) to a CSV file in the working directory
outliers.to_csv(path_or_buf='outliers.csv')