#### 1. Import pandas library

In [1]:
import pandas as pd

#### 2. Load data (users.csv, posts.csv)

In [2]:
# Load the users table from users.csv (path relative to the working directory).
users = pd.read_csv('users.csv')

In [3]:
# Load the posts table from posts.csv (path relative to the working directory).
posts = pd.read_csv('posts.csv')

#### 3. Rename id column to user_id

In [4]:
# Rename the generic 'id' column to 'user_id' so it can serve as the merge key.
users = users.rename(columns={'id': 'user_id'})

# Show the column labels to confirm the rename.
users.columns

Index(['Unnamed: 0', 'user_id', 'display_name', 'about_me', 'age',
       'creation_date', 'last_access_date', 'location', 'reputation',
       'up_votes', 'down_votes', 'views', 'profile_image_url', 'website_url'],
      dtype='object')

#### 4. Rename id column to post_id and owner_user_id to user_id

In [5]:
# Rename 'id' to 'post_id' and 'owner_user_id' to 'user_id' so that posts shares
# its key name with the users frame. The result is re-bound rather than mutated
# with inplace=True, matching how the users frame is renamed.
posts = posts.rename(columns={'id': 'post_id', 'owner_user_id': 'user_id'})

# Show the column labels to confirm the rename.
posts.columns

Index(['post_id', 'user_id', 'score', 'view_count', 'comment_count',
       'favorite_count'],
      dtype='object')

#### 5. Define new dataframes for users and posts with the following selected columns:
    **users columns**: user_id, reputation,views,up_votes,down_votes
    **posts columns**: post_id, score,user_id,view_count,comment_count

In [6]:
# Keep only the user columns needed for the analysis. The explicit copy makes
# new_users an independent frame instead of a selection of `users`.
new_users = users.loc[
    :, ['user_id', 'reputation', 'views', 'up_votes', 'down_votes']
].copy()
new_users


Unnamed: 0,user_id,reputation,views,up_votes,down_votes
0,107658,305,91,2,0
1,218597,6559,374,361,25
2,326360,1802,97,40,14
3,379556,2335,261,125,10
4,450456,7746,3598,256,50
...,...,...,...,...,...
14843,574332,41,6,10,0
14844,7155452,41,6,0,0
14845,5529001,41,11,0,0
14846,4105549,41,11,2,0


In [7]:
# Keep only the post columns needed for the analysis. The explicit copy makes
# new_posts an independent frame instead of a selection of `posts`.
new_posts = posts.loc[
    :, ['post_id', 'score', 'user_id', 'view_count', 'comment_count']
].copy()
new_posts

Unnamed: 0,post_id,score,user_id,view_count,comment_count
0,30336926,1,,14,3
1,36873524,1,,18,0
2,36605876,1,,19,0
3,36718461,1,,17,0
4,30434893,0,,15,0
...,...,...,...,...,...
325605,37142382,0,6315949.0,15,0
325606,36884286,0,3733872.0,20,1
325607,36140188,0,5216651.0,12,0
325608,36293699,1,4811040.0,20,1


#### 6. Merge both dataframes, users and posts. 
You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes.

In [8]:
# Left merge on user_id: every row of new_users is kept, whether or not the
# user has written a post, together with all of that user's posts (a user may
# have more than one post, and therefore more than one row).
left_marge = pd.merge(new_users, new_posts, how='left', on='user_id')
left_marge

Unnamed: 0,user_id,reputation,views,up_votes,down_votes,post_id,score,view_count,comment_count
0,107658,305,91,2,0,,,,
1,218597,6559,374,361,25,,,,
2,326360,1802,97,40,14,,,,
3,379556,2335,261,125,10,,,,
4,450456,7746,3598,256,50,34989087.0,0.0,19.0,0.0
...,...,...,...,...,...,...,...,...,...
15244,574332,41,6,10,0,,,,
15245,7155452,41,6,0,0,,,,
15246,5529001,41,11,0,0,,,,
15247,4105549,41,11,2,0,,,,


In [9]:
# Inner merge on user_id: only the users that have written at least one post
# are kept, with one row per (user, post) pair.
inner_merge = pd.merge(new_users, new_posts, how='inner', on='user_id')
inner_merge

Unnamed: 0,user_id,reputation,views,up_votes,down_votes,post_id,score,view_count,comment_count
0,450456,7746,3598,256,50,34989087,0,19,0
1,450456,7746,3598,256,50,4142174,0,15,0
2,1517244,4172,224,1362,32,37211628,2,18,0
3,1870509,897,105,86,2,30260248,1,11,4
4,101719,4055,332,174,16,34528617,0,18,0
...,...,...,...,...,...,...,...,...,...
1060,4295902,1,4,0,0,31038301,0,10,0
1061,2856201,11,21,0,0,19348144,0,19,2
1062,5882818,11,3,0,0,35199880,0,20,2
1063,6241297,13,2,0,0,36798683,0,12,0


#### 7. How many missing values do you have in your merged dataframe? 

In [10]:
# Number of missing values in each column of the left merge.
left_marge.isnull().sum()

user_id              0
reputation           0
views                0
up_votes             0
down_votes           0
post_id          14184
score            14184
view_count       14184
comment_count    14184
dtype: int64

In [11]:
# Number of missing values in each column of the inner merge.
inner_merge.isnull().sum()

user_id          0
reputation       0
views            0
up_votes         0
down_votes       0
post_id          0
score            0
view_count       0
comment_count    0
dtype: int64

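The inner merge has no missing values. The left merge has 14184 missing values in each of the post columns (post_id, score, view_count, comment_count): these rows belong to users who have not written any post.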

#### Bonus: Identify extreme values in your merged dataframe. Create a dataframe called outliers with the same columns as our data set and calculate the bounds. The values of the outliers dataframe will be the values of merged_df that fall outside those bounds. You will need to save your outliers dataframe to a csv file in the your-code folder. Hint: post_id cannot have outliers!

In [40]:
#https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba


# Helper that computes the interquartile range, used below to decide which
# values count as extreme.
def IQR_function(x):
    """Return the interquartile range of ``x``.

    The result is ``x.quantile(0.75) - x.quantile(0.25)``, so ``x`` can be
    any object that exposes a pandas-style ``quantile`` method.
    """
    first_quartile = x.quantile(0.25)
    third_quartile = x.quantile(0.75)
    return third_quartile - first_quartile

# Equivalent of a "bounds" table: a single-row frame (index label 0) holding
# the interquartile range of every measurement column of inner_merge.
# user_id and post_id are identifiers, so no IQR is computed for them and
# their cells stay NaN.
id_columns = ['user_id', 'post_id']
iqr_by_column = {
    column: IQR_function(inner_merge[column])
    for column in inner_merge.columns
    if column not in id_columns
}

# Build the row in one constructor call instead of growing an empty frame
# column by column; columns= keeps inner_merge's column order and leaves the
# skipped identifier columns as NaN.
df_IQR = pd.DataFrame([iqr_by_column], columns=inner_merge.columns)

# Display the table (a cell ending in a loop produces no output).
df_IQR


        



Unnamed: 0,user_id,reputation,views,up_votes,down_votes,post_id,score,view_count,comment_count
0,,4960.0,457.0,360.0,20.0,,0.0,6.0,1.0


In [91]:
# Build the outliers table from inner_merge. A value is an outlier when it
# lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR of its own column.
# user_id and post_id are identifiers, not measurements: they are never tested
# and are always carried over so each outlier can be traced to its user/post.
id_columns = ['user_id', 'post_id']
measure_columns = [c for c in inner_merge.columns if c not in id_columns]
measures = inner_merge[measure_columns]

# Per-column quartiles and bounds for all measurement columns at once. The IQR
# is computed here directly so this cell depends only on inner_merge.
fence = 1.5
q1 = measures.quantile(0.25)
q3 = measures.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - fence * iqr
upper_bound = q3 + fence * iqr

# Boolean frame: True where a cell falls outside the bounds of its column.
is_outlier = (
    measures.lt(lower_bound, axis='columns')
    | measures.gt(upper_bound, axis='columns')
)

# Keep only the outlying values; every other cell becomes NaN. Casting to
# object first keeps the original integers intact instead of upcasting whole
# columns to float just to hold NaN (so the CSV shows 3598, not 3598.0).
outlier_values = measures.astype(object).where(is_outlier)

# Identifiers first, then the measurements; drop the rows that contain no
# outlier at all and renumber the remaining rows from 0.
df_outliers = (
    inner_merge[id_columns]
    .join(outlier_values)
    .loc[is_outlier.any(axis='columns')]
    .reset_index(drop=True)
)

# Check the result
df_outliers


Unnamed: 0,user_id,post_id,reputation,views,up_votes,down_votes,score,view_count,comment_count
0,450456,34989087,,3598,,,,,
1,450456,4142174,,3598,,,,,
2,1517244,37211628,,,1362,,2,,
3,1870509,30260248,,,,,1,,4
4,421039,37075759,,,,,1,,
...,...,...,...,...,...,...,...,...,...
497,3348196,35319646,,,,,-1,,
498,3348196,35686303,,,,,-1,,
499,4288595,36861243,,,,,1,,
500,1551819,35987581,,,,,,,8


In [93]:
# Save the outliers table to disk; index=False leaves the row index out of the file.
df_outliers.to_csv('outliers_values.csv', index=False)