# Data processing and cleaning

### 1. Environment setup - installing libraries, modules and initialization of Spark Session

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, BooleanType, TimestampType
import pyspark.sql.functions as f

# Obtain the SparkSession used by every cell below (an existing session is reused).
# NOTE(review): the `%%configure -f` cell that follows appears to restart the
# session (its output shows "Starting Spark application"), so on a top-to-bottom
# run the imports above may be lost -- consider moving that cell to the very top.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .master('local[*]')
    .getOrCreate()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
%%configure -f
{
    "conf": {
        "spark.jars.packages": "com.databricks:spark-xml_2.12:0.16.0",
        "spark.jars": "https://repo1.maven.org/maven2/com/databricks/spark-xml_2.12/0.16.0/spark-xml_2.12-0.16.0.jar"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3,application_1685028291765_0004,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3,application_1685028291765_0004,pyspark,idle,Link,Link,,✔


### 2. Creating DataFrames and imposing a data schema

#### a) Badges

In [11]:
# Explicit schema for Badges.xml so that column types are imposed, not inferred.
schema_badges = StructType([
    StructField("_Class", IntegerType()),
    StructField("_Date", TimestampType()),
    StructField("_Id", IntegerType()),
    StructField("_Name", StringType()),
    StructField("_TagBased", BooleanType()),
    StructField("_UserId", IntegerType()),
])

# Every <row> element is read as one record through the spark-xml data source.
Badges_df = (
    spark.read.format("com.databricks.spark.xml")
    .option("rowTag", "row")
    .option("charset", "UTF8")
    .schema(schema_badges)
    .option("treatEmptyValuesAsNulls", "true")
    .load("s3://bigdata-project-data-us-east-1/Badges.xml")
)

# Sanity check: print the imposed schema and preview the data untruncated.
Badges_df.printSchema()
Badges_df.show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _Class: integer (nullable = true)
 |-- _Date: timestamp (nullable = true)
 |-- _Id: integer (nullable = true)
 |-- _Name: string (nullable = true)
 |-- _TagBased: boolean (nullable = true)
 |-- _UserId: integer (nullable = true)

+------+-----------------------+---+--------------+---------+-------+
|_Class|_Date                  |_Id|_Name         |_TagBased|_UserId|
+------+-----------------------+---+--------------+---------+-------+
|3     |2016-01-12 18:44:49.267|2  |Autobiographer|false    |23     |
|3     |2016-01-12 18:44:49.267|3  |Autobiographer|false    |22     |
|3     |2016-01-12 18:44:49.267|4  |Autobiographer|false    |21     |
|3     |2016-01-12 18:44:49.267|5  |Autobiographer|false    |20     |
|3     |2016-01-12 18:44:49.267|6  |Autobiographer|false    |19     |
|3     |2016-01-12 18:44:49.267|7  |Autobiographer|false    |18     |
|3     |2016-01-12 18:44:49.267|8  |Autobiographer|false    |17     |
|3     |2016-01-12 18:44:49.267|9  |Autobiographer|false    

#### b) Comments

In [27]:
# Explicit schema for Comments.xml so that column types are imposed, not inferred.
schema_comments = StructType([
    StructField("_ContentLicense", StringType()),
    StructField("_CreationDate", TimestampType()),
    StructField("_Id", IntegerType()),
    StructField("_PostId", IntegerType()),
    StructField("_Score", IntegerType()),
    StructField("_Text", StringType()),
    StructField("_UserDisplayName", StringType()),
    StructField("_UserId", IntegerType()),
])

# Every <row> element is read as one record through the spark-xml data source.
Comments_df = (
    spark.read.format("com.databricks.spark.xml")
    .option("rowTag", "row")
    .option("charset", "UTF8")
    .schema(schema_comments)
    .option("treatEmptyValuesAsNulls", "true")
    .load("s3://bigdata-project-data-us-east-1/Comments.xml")
)

# Sanity check: schema plus the first three records, one field per line.
Comments_df.printSchema()
Comments_df.show(3, vertical=True, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _ContentLicense: string (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _Id: integer (nullable = true)
 |-- _PostId: integer (nullable = true)
 |-- _Score: integer (nullable = true)
 |-- _Text: string (nullable = true)
 |-- _UserDisplayName: string (nullable = true)
 |-- _UserId: integer (nullable = true)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 _ContentLicense  | CC BY-SA 3.0                                                                                                                                       

#### c) PostHistory

In [7]:
# Explicit schema for PostHistory.xml so that column types are imposed, not inferred.
schema_posthistory = StructType([
    StructField("_Comment", StringType()),
    StructField("_ContentLicense", StringType()),
    StructField("_CreationDate", TimestampType()),
    StructField("_Id", IntegerType()),
    StructField("_PostHistoryTypeId", IntegerType()),
    StructField("_PostId", IntegerType()),
    StructField("_RevisionGUID", StringType()),
    StructField("_Text", StringType()),
    StructField("_UserDisplayName", StringType()),
    StructField("_UserId", IntegerType()),
])

# Every <row> element is read as one record through the spark-xml data source.
PostHistory_df = (
    spark.read.format("com.databricks.spark.xml")
    .option("rowTag", "row")
    .option("charset", "UTF8")
    .schema(schema_posthistory)
    .option("treatEmptyValuesAsNulls", "true")
    .load("s3://bigdata-project-data-us-east-1/PostHistory.xml")
)

# Sanity check: schema plus the first three records, one field per line.
PostHistory_df.printSchema()
PostHistory_df.show(3, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _Comment: string (nullable = true)
 |-- _ContentLicense: string (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _Id: integer (nullable = true)
 |-- _PostHistoryTypeId: integer (nullable = true)
 |-- _PostId: integer (nullable = true)
 |-- _RevisionGUID: string (nullable = true)
 |-- _Text: string (nullable = true)
 |-- _UserDisplayName: string (nullable = true)
 |-- _UserId: integer (nullable = true)

-RECORD 0----------------------------------
 _Comment           | null                 
 _ContentLicense    | CC BY-SA 3.0         
 _CreationDate      | 2016-01-12 18:45:... 
 _Id                | 1                    
 _PostHistoryTypeId | 2                    
 _PostId            | 1                    
 _RevisionGUID      | 6deffe8b-79c7-467... 
 _Text              | When I've printed... 
 _UserDisplayName   | null                 
 _UserId            | 16                   
-RECORD 1----------------------------------
 _Comment           | null        

#### d) PostLinks

In [8]:
# Explicit schema for PostLinks.xml, built from (column name, Spark type) pairs.
schema_postlinks = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("_CreationDate", TimestampType()),
        ("_Id", IntegerType()),
        ("_LinkTypeId", IntegerType()),
        ("_PostId", IntegerType()),
        ("_RelatedPostId", IntegerType()),
    ]
])

# Every <row> element is read as one record through the spark-xml data source.
PostLinks_df = (
    spark.read.format("com.databricks.spark.xml")
    .option("rowTag", "row")
    .option("charset", "UTF8")
    .schema(schema_postlinks)
    .option("treatEmptyValuesAsNulls", "true")
    .load("s3://bigdata-project-data-us-east-1/PostLinks.xml")
)

# Sanity check: schema plus the first three records, one field per line.
PostLinks_df.printSchema()
PostLinks_df.show(3, vertical=True, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _CreationDate: timestamp (nullable = true)
 |-- _Id: integer (nullable = true)
 |-- _LinkTypeId: integer (nullable = true)
 |-- _PostId: integer (nullable = true)
 |-- _RelatedPostId: integer (nullable = true)

-RECORD 0---------------------------------
 _CreationDate  | 2016-01-12 20:16:38.26  
 _Id            | 33                      
 _LinkTypeId    | 1                       
 _PostId        | 49                      
 _RelatedPostId | 2                       
-RECORD 1---------------------------------
 _CreationDate  | 2016-01-12 20:49:28.873 
 _Id            | 52                      
 _LinkTypeId    | 1                       
 _PostId        | 65                      
 _RelatedPostId | 20                      
-RECORD 2---------------------------------
 _CreationDate  | 2016-01-13 01:52:20.523 
 _Id            | 216                     
 _LinkTypeId    | 1                       
 _PostId        | 81                      
 _RelatedPostId | 49                      
only 

#### e) Posts

In [9]:
# Explicit schema for Posts.xml so that column types are imposed, not inferred.
schema_posts = StructType([
    StructField("_AcceptedAnswerId", IntegerType()),
    StructField("_AnswerCount", IntegerType()),
    StructField("_Body", StringType()),
    StructField("_ClosedDate", TimestampType()),
    StructField("_CommentCount", IntegerType()),
    StructField("_CommunityOwnedDate", TimestampType()),
    StructField("_ContentLicense", StringType()),
    StructField("_CreationDate", TimestampType()),
    StructField("_FavoriteCount", IntegerType()),
    StructField("_Id", IntegerType()),
    StructField("_LastActivityDate", TimestampType()),
    StructField("_LastEditDate", TimestampType()),
    StructField("_LastEditorDisplayName", StringType()),
    StructField("_LastEditorUserId", IntegerType()),
    StructField("_OwnerDisplayName", StringType()),
    StructField("_OwnerUserId", IntegerType()),
    StructField("_ParentId", IntegerType()),
    StructField("_PostTypeId", IntegerType()),
    StructField("_Score", IntegerType()),
    StructField("_Tags", StringType()),
    StructField("_Title", StringType()),
    StructField("_ViewCount", IntegerType()),
])

# Every <row> element is read as one record through the spark-xml data source.
Posts_df = (
    spark.read.format("com.databricks.spark.xml")
    .option("rowTag", "row")
    .option("charset", "UTF8")
    .schema(schema_posts)
    .option("treatEmptyValuesAsNulls", "true")
    .load("s3://bigdata-project-data-us-east-1/Posts.xml")
)

# Sanity check: schema plus the first three records, one field per line.
Posts_df.printSchema()
Posts_df.show(3, vertical=True, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _AcceptedAnswerId: integer (nullable = true)
 |-- _AnswerCount: integer (nullable = true)
 |-- _Body: string (nullable = true)
 |-- _ClosedDate: timestamp (nullable = true)
 |-- _CommentCount: integer (nullable = true)
 |-- _CommunityOwnedDate: timestamp (nullable = true)
 |-- _ContentLicense: string (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _FavoriteCount: integer (nullable = true)
 |-- _Id: integer (nullable = true)
 |-- _LastActivityDate: timestamp (nullable = true)
 |-- _LastEditDate: timestamp (nullable = true)
 |-- _LastEditorDisplayName: string (nullable = true)
 |-- _LastEditorUserId: integer (nullable = true)
 |-- _OwnerDisplayName: string (nullable = true)
 |-- _OwnerUserId: integer (nullable = true)
 |-- _ParentId: integer (nullable = true)
 |-- _PostTypeId: integer (nullable = true)
 |-- _Score: integer (nullable = true)
 |-- _Tags: string (nullable = true)
 |-- _Title: string (nullable = true)
 |-- _ViewCount: integer (nullable = true

#### f) Tags

In [12]:
# Explicit schema for Tags.xml so that column types are imposed, not inferred.
# NOTE(review): `_CreationDate` is NULL in every row of the preview output below;
# the attribute may not exist in Tags.xml -- confirm before relying on it.
schema_tags = StructType([
    StructField("_Count", IntegerType()),
    StructField("_ExcerptPostId", IntegerType()),
    StructField("_CreationDate", TimestampType()),
    StructField("_Id", IntegerType()),
    StructField("_TagName", StringType()),
    StructField("_WikiPostId", IntegerType()),
])

# Every <row> element is read as one record through the spark-xml data source.
Tags_df = (
    spark.read.format("com.databricks.spark.xml")
    .option("rowTag", "row")
    .option("charset", "UTF8")
    .schema(schema_tags)
    .option("treatEmptyValuesAsNulls", "true")
    .load("s3://bigdata-project-data-us-east-1/Tags.xml")
)

# Sanity check: schema plus the first three records, one field per line.
Tags_df.printSchema()
Tags_df.show(3, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _Count: integer (nullable = true)
 |-- _ExcerptPostId: integer (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _Id: integer (nullable = true)
 |-- _TagName: string (nullable = true)
 |-- _WikiPostId: integer (nullable = true)

-RECORD 0--------------------
 _Count         | 27         
 _ExcerptPostId | 434        
 _CreationDate  | null       
 _Id            | 1          
 _TagName       | resolution 
 _WikiPostId    | 433        
-RECORD 1--------------------
 _Count         | 48         
 _ExcerptPostId | 112        
 _CreationDate  | null       
 _Id            | 2          
 _TagName       | speed      
 _WikiPostId    | 111        
-RECORD 2--------------------
 _Count         | 38         
 _ExcerptPostId | 114        
 _CreationDate  | null       
 _Id            | 3          
 _TagName       | quality    
 _WikiPostId    | 113        
only showing top 3 rows

#### g) Users 

In [13]:
# Explicit schema for Users.xml so that column types are imposed, not inferred.
# (The list previously lacked a comma after the `_CreationDate` field, which made
# the whole cell a SyntaxError.)
schema_users = StructType([
    StructField("_AboutMe", StringType()),
    StructField("_AccountId", IntegerType()),
    StructField("_CreationDate", TimestampType()),
    StructField("_DisplayName", StringType()),
    StructField("_DownVotes", IntegerType()),
    StructField("_Id", IntegerType()),
    StructField("_LastAccessDate", TimestampType()),
    StructField("_Location", StringType()),
    StructField("_Reputation", IntegerType()),
    StructField("_UpVotes", IntegerType()),
    StructField("_Views", IntegerType()),
    StructField("_WebsiteUrl", StringType()),
])

# Every <row> element is read as one record through the spark-xml data source.
Users_df = (
    spark.read.format("com.databricks.spark.xml")
    .option("rowTag", "row")
    .option("charset", "UTF8")
    .schema(schema_users)
    .option("treatEmptyValuesAsNulls", "true")
    .load("s3://bigdata-project-data-us-east-1/Users.xml")
)

# Sanity check: schema plus the first three records, one field per line.
Users_df.printSchema()
Users_df.show(3, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _AboutMe: string (nullable = true)
 |-- _AccountId: integer (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _DisplayName: string (nullable = true)
 |-- _DownVotes: integer (nullable = true)
 |-- _Id: integer (nullable = true)
 |-- _LastAccessDate: timestamp (nullable = true)
 |-- _Location: string (nullable = true)
 |-- _Reputation: integer (nullable = true)
 |-- _UpVotes: integer (nullable = true)
 |-- _Views: integer (nullable = true)
 |-- _WebsiteUrl: string (nullable = true)

-RECORD 0-------------------------------
 _AboutMe        | <p>Hi, I'm not re... 
 _AccountId      | -1                   
 _CreationDate   | 2016-01-11 22:16:... 
 _DisplayName    | Community            
 _DownVotes      | 3585                 
 _Id             | -1                   
 _LastAccessDate | 2016-01-11 22:16:... 
 _Location       | on the server farm   
 _Reputation     | 1                    
 _UpVotes        | 51                   
 _Views          | 160         

#### h) Votes

In [14]:
# Explicit schema for Votes.xml, built from (column name, Spark type) pairs.
schema_votes = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("_BountyAmount", IntegerType()),
        ("_CreationDate", TimestampType()),
        ("_Id", IntegerType()),
        ("_PostId", IntegerType()),
        ("_UserId", IntegerType()),
        ("_VoteTypeId", IntegerType()),
    ]
])

# Every <row> element is read as one record through the spark-xml data source.
Votes_df = (
    spark.read.format("com.databricks.spark.xml")
    .option("rowTag", "row")
    .option("charset", "UTF8")
    .schema(schema_votes)
    .option("treatEmptyValuesAsNulls", "true")
    .load("s3://bigdata-project-data-us-east-1/Votes.xml")
)

# Sanity check: schema plus the first three records, one field per line.
Votes_df.printSchema()
Votes_df.show(3, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _BountyAmount: integer (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _Id: integer (nullable = true)
 |-- _PostId: integer (nullable = true)
 |-- _UserId: integer (nullable = true)
 |-- _VoteTypeId: integer (nullable = true)

-RECORD 0----------------------------
 _BountyAmount | null                
 _CreationDate | 2016-01-12 00:00:00 
 _Id           | 1                   
 _PostId       | 1                   
 _UserId       | null                
 _VoteTypeId   | 2                   
-RECORD 1----------------------------
 _BountyAmount | null                
 _CreationDate | 2016-01-12 00:00:00 
 _Id           | 2                   
 _PostId       | 2                   
 _UserId       | null                
 _VoteTypeId   | 2                   
-RECORD 2----------------------------
 _BountyAmount | null                
 _CreationDate | 2016-01-12 00:00:00 
 _Id           | 3                   
 _PostId       | 3                   
 _UserId       | n

### 3. Data cleaning

#### 3.1. Clean column names

#### a) Badges

In [16]:
# Re-select every column under its name with the "_" characters removed.
Badges_DF = Badges_df.select(
    [
        f.col(raw_name).alias(raw_name.replace('_', ''))
        for raw_name in Badges_df.columns
    ]
)
Badges_DF.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+--------------------+---+--------------+--------+------+
|Class|                Date| Id|          Name|TagBased|UserId|
+-----+--------------------+---+--------------+--------+------+
|    3|2016-01-12 18:44:...|  2|Autobiographer|   false|    23|
|    3|2016-01-12 18:44:...|  3|Autobiographer|   false|    22|
|    3|2016-01-12 18:44:...|  4|Autobiographer|   false|    21|
|    3|2016-01-12 18:44:...|  5|Autobiographer|   false|    20|
|    3|2016-01-12 18:44:...|  6|Autobiographer|   false|    19|
|    3|2016-01-12 18:44:...|  7|Autobiographer|   false|    18|
|    3|2016-01-12 18:44:...|  8|Autobiographer|   false|    17|
|    3|2016-01-12 18:44:...|  9|Autobiographer|   false|    16|
|    3|2016-01-12 18:44:...| 10|Autobiographer|   false|    15|
|    3|2016-01-12 18:44:...| 11|Autobiographer|   false|    14|
|    3|2016-01-12 18:44:...| 12|Autobiographer|   false|    13|
|    3|2016-01-12 18:44:...| 13|Autobiographer|   false|    12|
|    3|2016-01-12 18:44:...| 14|Autobiog

#### b) Comments

In [17]:
# Re-select every column under its name with the "_" characters removed.
Comments_DF = Comments_df.select(
    [
        f.col(raw_name).alias(raw_name.replace('_', ''))
        for raw_name in Comments_df.columns
    ]
)
Comments_DF.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------+--------------------+---+------+-----+--------------------+---------------+------+
|ContentLicense|        CreationDate| Id|PostId|Score|                Text|UserDisplayName|UserId|
+--------------+--------------------+---+------+-----+--------------------+---------------+------+
|  CC BY-SA 3.0|2016-01-12 18:47:...|  1|     1|    4|Did I just place ...|           null|    23|
|  CC BY-SA 3.0|2016-01-12 19:01:...|  2|     3|    1|I think it would ...|           null|    10|
|  CC BY-SA 3.0|2016-01-12 19:42:...|  3|     1|    1|What are you look...|           null|    26|
|  CC BY-SA 3.0|2016-01-12 19:42:...|  4|    21|    9|It is worth notin...|           null|    36|
|  CC BY-SA 3.0|2016-01-12 19:44:...|  5|     5|    7|I think this is t...|           null|    36|
|  CC BY-SA 3.0|2016-01-12 19:44:...|  6|    21|    0|Note that this pa...|           null|    46|
|  CC BY-SA 3.0|2016-01-12 19:45:...|  7|     5|    1|I'm voting to clo...|           null|    17|
|  CC BY-S

#### c) PostHistory

In [21]:
# Re-select every column under its name with the "_" characters removed.
PostHistory_DF = PostHistory_df.select(
    [
        f.col(raw_name).alias(raw_name.replace('_', ''))
        for raw_name in PostHistory_df.columns
    ]
)
PostHistory_DF.show(3, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0---------------------------------
 Comment           | null                 
 ContentLicense    | CC BY-SA 3.0         
 CreationDate      | 2016-01-12 18:45:... 
 Id                | 1                    
 PostHistoryTypeId | 2                    
 PostId            | 1                    
 RevisionGUID      | 6deffe8b-79c7-467... 
 Text              | When I've printed... 
 UserDisplayName   | null                 
 UserId            | 16                   
-RECORD 1---------------------------------
 Comment           | null                 
 ContentLicense    | CC BY-SA 3.0         
 CreationDate      | 2016-01-12 18:45:... 
 Id                | 2                    
 PostHistoryTypeId | 1                    
 PostId            | 1                    
 RevisionGUID      | 6deffe8b-79c7-467... 
 Text              | How to obtain hig... 
 UserDisplayName   | null                 
 UserId            | 16                   
-RECORD 2---------------------------------
 Comment   

#### d) PostLinks

In [19]:
# Re-select every column under its name with the "_" characters removed.
PostLinks_DF = PostLinks_df.select(
    [
        f.col(raw_name).alias(raw_name.replace('_', ''))
        for raw_name in PostLinks_df.columns
    ]
)
PostLinks_DF.show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------+----+----------+------+-------------+
|CreationDate           |Id  |LinkTypeId|PostId|RelatedPostId|
+-----------------------+----+----------+------+-------------+
|2016-01-12 20:16:38.26 |33  |1         |49    |2            |
|2016-01-12 20:49:28.873|52  |1         |65    |20           |
|2016-01-13 01:52:20.523|216 |1         |81    |49           |
|2016-01-13 17:11:36.973|357 |1         |199   |6            |
|2016-01-13 20:41:04.5  |381 |1         |211   |147          |
|2016-01-13 20:41:04.5  |382 |1         |211   |181          |
|2016-01-13 20:59:21.28 |388 |1         |215   |78           |
|2016-01-14 07:27:32.037|449 |1         |239   |84           |
|2016-01-15 13:05:05.66 |504 |1         |269   |264          |
|2016-01-15 16:33:07.33 |513 |1         |271   |61           |
|2016-01-19 19:52:06.613|639 |1         |334   |54           |
|2016-01-25 12:37:40.963|722 |1         |388   |233          |
|2016-01-25 15:29:29.957|737 |1         |383   |233    

#### e) Posts

In [22]:
# Re-select every column under its name with the "_" characters removed.
Posts_DF = Posts_df.select(
    [
        f.col(raw_name).alias(raw_name.replace('_', ''))
        for raw_name in Posts_df.columns
    ]
)
Posts_DF.show(3, vertical=True, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 AcceptedAnswerId      | 51                                                                                                                                                                                                                                                                                                                                                                                                                                                        
 AnswerCount           | 2                                      

#### f) Tags

In [23]:
# Re-select every column under its name with the "_" characters removed.
Tags_DF = Tags_df.select(
    [
        f.col(raw_name).alias(raw_name.replace('_', ''))
        for raw_name in Tags_df.columns
    ]
)
Tags_DF.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+-------------+------------+---+------------------+----------+
|Count|ExcerptPostId|CreationDate| Id|           TagName|WikiPostId|
+-----+-------------+------------+---+------------------+----------+
|   27|          434|        null|  1|        resolution|       433|
|   48|          112|        null|  2|             speed|       111|
|   38|          114|        null|  3|           quality|       113|
|   14|          120|        null|  4|            health|       119|
|   60|          489|        null|  6|          material|       488|
|  103|          287|        null|  7|               fdm|       286|
|  285|            8|        null|  8|          filament|         7|
|   55|          428|        null|  9|          makerbot|       427|
|   23|         7059|        null| 10|             color|      7058|
|    5|        10078|        null| 11|          outdoors|     10077|
|   28|          812|        null| 13|           surface|       811|
|    9|         3690|        null|

#### g) Users

In [103]:
# Re-select every column under its name with the "_" characters removed.
Users_DF = Users_df.select(
    [
        f.col(raw_name).alias(raw_name.replace('_', ''))
        for raw_name in Users_df.columns
    ]
)
Users_DF.show(3, vertical=True, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 AboutMe        | <p>Hi, I'm not really a person.</p>\r\n<p>I'm a background process that helps keep this site clean!</p>\r\n<p>I do things like</p>\r\n<ul>\r\n<li>Randomly poke old unanswered questions every hour so they get some attention</li>\r\n<li>Own community questions and answers so nobody gets unnecessary reputation f

#### h) Votes

In [25]:
# Re-select every column under its name with the "_" characters removed.
Votes_DF = Votes_df.select(
    [
        f.col(raw_name).alias(raw_name.replace('_', ''))
        for raw_name in Votes_df.columns
    ]
)
Votes_DF.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-------------------+---+------+------+----------+
|BountyAmount|       CreationDate| Id|PostId|UserId|VoteTypeId|
+------------+-------------------+---+------+------+----------+
|        null|2016-01-12 00:00:00|  1|     1|  null|         2|
|        null|2016-01-12 00:00:00|  2|     2|  null|         2|
|        null|2016-01-12 00:00:00|  3|     3|  null|         2|
|        null|2016-01-12 00:00:00|  4|     1|  null|         2|
|        null|2016-01-12 00:00:00|  5|     2|  null|        16|
|        null|2016-01-12 00:00:00|  6|     4|  null|         2|
|        null|2016-01-12 00:00:00|  7|     4|  null|         2|
|        null|2016-01-12 00:00:00|  8|     6|  null|         2|
|        null|2016-01-12 00:00:00|  9|     2|  null|         2|
|        null|2016-01-12 00:00:00| 10|     5|  null|         2|
|        null|2016-01-12 00:00:00| 11|     9|  null|         2|
|        null|2016-01-12 00:00:00| 12|     6|  null|         2|
|        null|2016-01-12 00:00:00| 13|  

#### 3.2. Clean text columns

#### a) Badges_DF - no cleaning needed

#### b) Comments_DF

##### - replace multiple blank spaces with just one space

In [60]:
# How many comments contain at least two consecutive blank spaces?
Comments_DF.filter(f.col("Text").like('%  %')).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2760

In [56]:
# Collapse every run of blank spaces in Text into a single space.
Comments_DF_clean = Comments_DF.withColumn(
    "Text",
    f.regexp_replace("Text", " +", " "),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [66]:
# Re-run the double-space count on the cleaned frame; 0 is expected, since runs
# longer than two spaces are collapsed as well.
Comments_DF_clean.filter(f.col("Text").like('%  %')).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

#### - removing unnecessary columns and records with NULL values in crucial columns

In [69]:
# Keep only the comments that have a UserId (rows with NULL UserId are dropped).
Comments_DF_clean2 = Comments_DF_clean.dropna(subset=["UserId"])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [70]:
# UserDisplayName is not needed downstream, so drop the column.
Comments_DF_clean3 = Comments_DF_clean2.drop("UserDisplayName")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [71]:
# Preview the comments after dropping NULL-UserId rows and the UserDisplayName column.
Comments_DF_clean3.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------+--------------------+---+------+-----+--------------------+------+
|ContentLicense|        CreationDate| Id|PostId|Score|                Text|UserId|
+--------------+--------------------+---+------+-----+--------------------+------+
|  CC BY-SA 3.0|2016-01-12 18:47:...|  1|     1|    4|Did I just place ...|    23|
|  CC BY-SA 3.0|2016-01-12 19:01:...|  2|     3|    1|I think it would ...|    10|
|  CC BY-SA 3.0|2016-01-12 19:42:...|  3|     1|    1|What are you look...|    26|
|  CC BY-SA 3.0|2016-01-12 19:42:...|  4|    21|    9|It is worth notin...|    36|
|  CC BY-SA 3.0|2016-01-12 19:44:...|  5|     5|    7|I think this is t...|    36|
|  CC BY-SA 3.0|2016-01-12 19:44:...|  6|    21|    0|Note that this pa...|    46|
|  CC BY-SA 3.0|2016-01-12 19:45:...|  7|     5|    1|I'm voting to clo...|    17|
|  CC BY-SA 3.0|2016-01-12 19:55:...|  9|     1|    1|@TomvanderZanden ...|    16|
|  CC BY-SA 3.0|2016-01-12 19:55:...| 10|    33|    0|Other than it wil...|    23|
|  C

#### c) PostHistory_DF

#### - replace multiple blank spaces with just one space

In [72]:
# Collapse every run of blank spaces in Text into a single space.
PostHistory_DF_clean = PostHistory_DF.withColumn(
    "Text",
    f.regexp_replace("Text", " +", " "),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### - removing unnecessary columns and records with NULL values in crucial columns

In [112]:
# Step 1: drop the history records that have no UserId.
PostHistory_DF_clean2 = PostHistory_DF_clean.dropna(subset=["UserId"])

# Step 2: drop the UserDisplayName column.
PostHistory_DF_clean3 = PostHistory_DF_clean2.drop("UserDisplayName")

# Preview the first three cleaned records, one field per line, untruncated.
PostHistory_DF_clean3.show(3, vertical=True, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Comment           | null                                                                                                                                                                                
 ContentLicense    | CC BY-SA 3.0                                                                                                                                                                        
 CreationDate      | 2016-01-12 18:45:19.963                                                                                                                                                             
 Id                | 1                                                                                                                                                                          

#### d) PostLinks_DF - no cleaning needed

In [75]:
# Number of PostLinks_DF rows whose CreationDate is NULL.

PostLinks_DF.filter(PostLinks_DF["CreationDate"].isNull()).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

In [77]:
# Number of PostLinks_DF rows whose Id is NULL.
PostLinks_DF.filter(PostLinks_DF["Id"].isNull()).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

In [78]:
# Number of PostLinks_DF rows whose LinkTypeId is NULL.
PostLinks_DF.filter(PostLinks_DF["LinkTypeId"].isNull()).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

In [80]:
# Number of PostLinks_DF rows whose PostId is NULL.
PostLinks_DF.filter(PostLinks_DF["PostId"].isNull()).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

In [81]:
# Number of PostLinks_DF rows whose PostId is NULL.
# NOTE(review): this is the same PostId check as the preceding cell; confirm whether
# a different column was intended here or the cell can be removed.
PostLinks_DF.filter(PostLinks_DF["PostId"].isNull()).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

In [82]:
# Number of PostLinks_DF rows whose RelatedPostId is NULL.
PostLinks_DF.filter(PostLinks_DF["RelatedPostId"].isNull()).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

#### e) Posts_DF 

#### - remove "<" and ">" and replace "><" with ","

In [83]:
# Rewrite the Tags value so that, for example, "<a><b>" becomes "a,b".
# The rewrites are applied in this order:
#   1. every "><" becomes ","
#   2. a "<" at the start of the value is removed
#   3. a ">" at the end of the value is removed
tags_rewrites = (("><", ","), ("^<", ""), (">$", ""))

tags_column = f.col("Tags")
for tags_pattern, tags_replacement in tags_rewrites:
    tags_column = f.regexp_replace(tags_column, tags_pattern, tags_replacement)

Posts_DF_clean = Posts_DF.withColumn("Tags", tags_column)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### - replace multiple blank spaces with just one space

In [88]:
# Collapse runs of consecutive spaces into a single space in Body and Title.
# The loop always starts from Posts_DF_clean, so re-running the cell gives the same result.
Posts_DF_clean2 = Posts_DF_clean
for text_column in ("Body", "Title"):
    Posts_DF_clean2 = Posts_DF_clean2.withColumn(
        text_column,
        f.regexp_replace(text_column, " +", " "),
    )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### f) Tags_DF

In [113]:
# Check Tags_DF for NULL values (this cell holds no code; the counts are computed in the following cells).

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [97]:
# Total number of rows in Tags_DF; the baseline for the NULL count in the following cell.

Tags_DF.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

424

In [99]:
# Number of Tags_DF rows whose CreationDate is NULL (compare with the total row count).

Tags_DF.filter(Tags_DF["CreationDate"].isNull()).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

424

In [114]:
# Drop CreationDate from Tags_DF; in the recorded run, the NULL count for this
# column equalled the total row count (424 of 424).

tags_columns_to_drop = ("CreationDate",)
Tags_DFclean = Tags_DF.drop(*tags_columns_to_drop)

# Preview the cleaned frame with the default show() settings.
Tags_DFclean.show(n=20, truncate=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+-------------+---+------------------+----------+
|Count|ExcerptPostId| Id|           TagName|WikiPostId|
+-----+-------------+---+------------------+----------+
|   27|          434|  1|        resolution|       433|
|   48|          112|  2|             speed|       111|
|   38|          114|  3|           quality|       113|
|   14|          120|  4|            health|       119|
|   60|          489|  6|          material|       488|
|  103|          287|  7|               fdm|       286|
|  285|            8|  8|          filament|         7|
|   55|          428|  9|          makerbot|       427|
|   23|         7059| 10|             color|      7058|
|    5|        10078| 11|          outdoors|     10077|
|   28|          812| 13|           surface|       811|
|    9|         3690| 15|       metal-parts|      3689|
|   91|          535| 17|support-structures|       534|
|   25|          126| 18|             rafts|       125|
|    7|          128| 21|             brims|    

#### g) Users_DF

#### - replace multiple blank spaces with just one space

In [115]:
# Collapse every run of consecutive spaces in the AboutMe column into a single space.
Users_DF_clean = Users_DF.withColumn(
    "AboutMe",
    f.regexp_replace(f.col("AboutMe"), " +", " "),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### h) Votes_DF

In [108]:
# Deleted accounts are marked with UserId == '-1'; count the votes that carry that marker.
Votes_DF.filter(Votes_DF["UserId"] == '-1').count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

#### - replace NULL values with '0' in column BountyAmount

In [110]:
# Replace NULL values in BountyAmount with 0.
# fillna returns a new DataFrame and only touches the columns listed in `subset`,
# so the column does not need to be re-assigned to itself with withColumn first.
Votes_DF_clean = Votes_DF.fillna(0, subset=["BountyAmount"])

# Preview the result to confirm that BountyAmount no longer shows NULL.
Votes_DF_clean.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-------------------+---+------+------+----------+
|BountyAmount|       CreationDate| Id|PostId|UserId|VoteTypeId|
+------------+-------------------+---+------+------+----------+
|           0|2016-01-12 00:00:00|  1|     1|  null|         2|
|           0|2016-01-12 00:00:00|  2|     2|  null|         2|
|           0|2016-01-12 00:00:00|  3|     3|  null|         2|
|           0|2016-01-12 00:00:00|  4|     1|  null|         2|
|           0|2016-01-12 00:00:00|  5|     2|  null|        16|
|           0|2016-01-12 00:00:00|  6|     4|  null|         2|
|           0|2016-01-12 00:00:00|  7|     4|  null|         2|
|           0|2016-01-12 00:00:00|  8|     6|  null|         2|
|           0|2016-01-12 00:00:00|  9|     2|  null|         2|
|           0|2016-01-12 00:00:00| 10|     5|  null|         2|
|           0|2016-01-12 00:00:00| 11|     9|  null|         2|
|           0|2016-01-12 00:00:00| 12|     6|  null|         2|
|           0|2016-01-12 00:00:00| 13|  

### 4. Saving data in Parquet format on S3

In [116]:
# Destination prefix shared by every Parquet output of this notebook.
PARQUET_ROOT = "s3://bigdata-parquet-data"

# (table name, final DataFrame) pairs, written in this order.
# The table name becomes the "<name>.parquet" component of the output path.
# NOTE(review): the load cell at the top of the notebook binds `Badges_df`;
# confirm that `Badges_DF` is defined before this cell runs.
frames_to_save = [
    ("Badges", Badges_DF),
    ("Comments", Comments_DF_clean3),
    ("PostHistory", PostHistory_DF_clean3),
    ("PostLinks", PostLinks_DF),
    ("Posts", Posts_DF_clean2),
    ("Tags", Tags_DFclean),
    ("Users", Users_DF_clean),
    ("Votes", Votes_DF_clean),
]

# No save mode is set, so Spark's default applies: a write raises an error when
# its target path already exists.
for table_name, table_frame in frames_to_save:
    table_frame.write.parquet(PARQUET_ROOT + "/" + table_name + ".parquet")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…