# Data Cleaning 

#### 1. Import pandas library.

In [1]:
import pandas as pd

#### 2. Import pymysql and sqlalchemy, as you learned in the lesson on importing/exporting data.


In [17]:
import pymysql
from sqlalchemy import create_engine, text

#### 3. Create a MySQL engine to set up the connection to the server. Check the connection details in [this link](https://relational.fit.cvut.cz/dataset/Stats).

In [39]:
# Engine bound to the "stats" database, using the connection details given
# in the link from the task description.
engine = create_engine(
    "mysql+pymysql://guest:relational@relational.fit.cvut.cz:3306/stats"
)

# Open a connection from the engine for running SQL statements directly.
connection = engine.connect()

#### 4. Import the users table.

In [73]:
# Raw SQL has to be wrapped in `text()`: SQLAlchemy 2.x no longer accepts a
# plain `str` in `Connection.execute` (`text()` also works on 1.x).
# NOTE(review): `users` is re-bound to a DataFrame by the `pd.read_sql` cell
# that follows, so this result object is never read.
users = connection.execute(text('SELECT * FROM users'))


In [74]:
# Load every row and column of the users table into a DataFrame via the engine.
users = pd.read_sql(sql="SELECT * FROM users", con=engine)

#### 5. Rename Id column to userId.

In [76]:
# Rename only the `Id` column; every other label is left untouched.
# A targeted rename does not depend on how many columns the query returned or
# on their order, whereas assigning a full list to `users.columns` relabels by
# position and would silently mislabel data if the order changed.
# Re-running the cell is harmless: once `Id` is gone the rename is a no-op.
users = users.rename(columns={'Id': 'userId'})

In [75]:
# Inspect the current column labels of `users`.
# NOTE(review): the output saved below still lists 'Id' — this cell ran as
# In [75], before the rename cell (In [76]); re-run in order to refresh it.
users.columns

Index(['Id', 'Reputation', 'CreationDate', 'DisplayName', 'LastAccessDate',
       'WebsiteUrl', 'Location', 'AboutMe', 'Views', 'UpVotes', 'DownVotes',
       'AccountId', 'Age', 'ProfileImageUrl'],
      dtype='object')

#### 6. Import the posts table. 

In [86]:
# Load every row and column of the posts table into a DataFrame via the engine.
posts = pd.read_sql(sql="SELECT * FROM posts", con=engine)

In [87]:
# Inspect the column labels of `posts` as loaded, before any renaming.
posts.columns

Index(['Id', 'PostTypeId', 'AcceptedAnswerId', 'CreaionDate', 'Score',
       'ViewCount', 'Body', 'OwnerUserId', 'LasActivityDate', 'Title', 'Tags',
       'AnswerCount', 'CommentCount', 'FavoriteCount', 'LastEditorUserId',
       'LastEditDate', 'CommunityOwnedDate', 'ParentId', 'ClosedDate',
       'OwnerDisplayName', 'LastEditorDisplayName'],
      dtype='object')

#### 7. Rename Id column to postId and OwnerUserId to userId.

In [91]:
# Rename only the two key columns; every other label is left untouched.
# A targeted rename does not depend on how many columns the query returned or
# on their order, whereas assigning a full list to `posts.columns` relabels by
# position and would silently mislabel data if the order changed.
# Re-running the cell is harmless: once the old labels are gone it is a no-op.
posts = posts.rename(columns={'Id': 'postId', 'OwnerUserId': 'userId'})

In [92]:
# Check the column labels of `posts` after the rename (postId / userId).
posts.columns

Index(['postId', 'PostTypeId', 'AcceptedAnswerId', 'CreaionDate', 'Score',
       'ViewCount', 'Body', 'userId', 'LasActivityDate', 'Title', 'Tags',
       'AnswerCount', 'CommentCount', 'FavoriteCount', 'LastEditorUserId',
       'LastEditDate', 'CommunityOwnedDate', 'ParentId', 'ClosedDate',
       'OwnerDisplayName', 'LastEditorDisplayName'],
      dtype='object')

#### 8. Define new dataframes for users and posts with the following selected columns:
**users columns**: userId, Reputation, Views, UpVotes, DownVotes  
**posts columns**: postId, Score, userId, ViewCount, CommentCount

In [113]:
# Keep only the columns requested for each table; all rows are retained.
df_users = users.loc[:, ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']]
df_posts = posts.loc[:, ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']]

#### 9. Merge the new dataframes you have created, of users and posts. 
You will need to perform an inner [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of the posts and users dataframes.

In [120]:
# Inner join on userId: posts on the left, users on the right, keeping only
# rows whose userId appears in both frames.
new_df = df_posts.merge(df_users, on='userId', how='inner')

In [122]:
# Inspect the column labels of the merged frame.
new_df.columns

Index(['postId', 'Score', 'userId', 'ViewCount', 'CommentCount', 'Reputation',
       'Views', 'UpVotes', 'DownVotes'],
      dtype='object')

#### 10. How many missing values do you have in your merged dataframe? On which columns?

In [123]:
# Number of missing values in each column of the merged frame.
new_df.isnull().sum()

postId              0
Score               0
userId              0
ViewCount       48396
CommentCount        0
Reputation          0
Views               0
UpVotes             0
DownVotes           0
dtype: int64

#### 11. You will need to do something about the missing values. Will you drop them or fill them? Explain.
**Remember** to check the results of your code before going to the next step.

In [124]:
# Share of missing values per column (mean of the boolean NaN mask).
# It is printed explicitly because only the last expression of a cell is
# displayed automatically, and this one is not last.
missing_share = new_df.isna().mean()
print(missing_share)

# ViewCount is the only column with gaps, and the share is too large (about
# 53% when this was run) to fill in credibly, so the column is dropped rather
# than imputed.
# `errors='ignore'` keeps the cell re-runnable once the column is already gone.
new_df = new_df.drop(columns='ViewCount', errors='ignore')

#### 12. Adjust the data types in order to avoid future issues. Which ones should be changed? 

In [125]:
# Preview the frame indexed by postId. The result is only displayed, not
# assigned, so `new_df` itself keeps its default index.
# NOTE(review): the saved output shows userId as float (8.0); this cell does
# not change any dtype — confirm the conversion happens in a later cell.
new_df.set_index(keys='postId')

Unnamed: 0_level_0,Score,userId,CommentCount,Reputation,Views,UpVotes,DownVotes
postId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,23,8.0,1,6764,1089,604,25
16,16,8.0,3,6764,1089,604,25
36,41,8.0,7,6764,1089,604,25
65,14,8.0,3,6764,1089,604,25
78,33,8.0,4,6764,1089,604,25
...,...,...,...,...,...,...,...
115366,1,55742.0,0,6,0,0,0
115370,1,55744.0,2,6,1,0,0
115371,0,35801.0,0,1,1,0,0
115375,0,49365.0,0,1,0,0,0
