# Top Stack Overflow tags
Dataset: https://www.kaggle.com/iancuv/stackoverflow-question-favourites

In [1]:
from pyspark.sql import SparkSession

In [125]:
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, TimestampType

In [3]:
# Create (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .appName("stackOverflow")
    .master("local[*]")
    .getOrCreate()
)

In [140]:
# Explicit schema applied when reading data/questions.csv.
# Fields are added in file column order; each one is nullable (the default).
questions_schema = (
    StructType()
    .add("Id", IntegerType())
    .add("CreationDate", TimestampType())
    .add("ClosedDate", TimestampType())
    .add("DeletionDate", TimestampType())
    .add("Score", IntegerType())
    .add("OwnerUserId", IntegerType())
    .add("AnswerCount", IntegerType())
)

# Explicit schema applied when reading data/question_tags.csv:
# one (question Id, tag) pair per row, both fields nullable.
question_tag_schema = (
    StructType()
    .add("Id", IntegerType())
    .add("Tag", StringType())
)

In [133]:
# Load the questions CSV with the explicit schema; the first line is a header
# and the literal string "NA" is read as null.
questions = (
    spark.read
    .schema(questions_schema)
    .option("header", "true")
    .option("sep", ",")
    .option("nullValue", "NA")
    .csv("data/questions.csv")
)

In [141]:
# Load the question/tag pairs CSV with the explicit schema; the first line is a
# header and the literal string "NA" is read as null.
question_tags = (
    spark.read
    .schema(question_tag_schema)
    .option("header", "true")
    .option("sep", ",")
    .option("nullValue", "NA")
    .csv("data/question_tags.csv")
)

In [183]:
# Print the column names, types and nullability of the tags frame as a tree.
question_tags.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Tag: string (nullable = true)



In [143]:
# Preview: bring at most 100 question rows to the driver as a pandas DataFrame.
(
    questions
    .limit(100)
    .toPandas()
)

Unnamed: 0,Id,CreationDate,ClosedDate,DeletionDate,Score,OwnerUserId,AnswerCount
0,1,2008-07-31 23:26:37,NaT,2011-03-28 02:53:47,1,,0
1,4,2008-07-31 23:42:52,NaT,NaT,458,8.0,13
2,6,2008-08-01 00:08:08,NaT,NaT,207,9.0,5
3,8,2008-08-01 01:33:19,2013-06-03 06:00:25,2015-02-11 09:26:40,42,,8
4,9,2008-08-01 01:40:59,NaT,NaT,1410,1.0,58
...,...,...,...,...,...,...,...
95,463,2008-08-02 16:38:27,NaT,2008-08-24 16:40:13,1,,5
96,469,2008-08-02 17:11:16,NaT,NaT,23,147.0,4
97,470,2008-08-02 17:11:47,2016-03-26 06:23:29,NaT,12,71.0,1
98,482,2008-08-02 18:09:56,NaT,NaT,34,77.0,3


In [144]:
# Preview: bring at most 100 (Id, Tag) rows to the driver as a pandas DataFrame.
(
    question_tags
    .limit(100)
    .toPandas()
)

Unnamed: 0,Id,Tag
0,1,data
1,4,c#
2,4,winforms
3,4,type-conversion
4,4,decimal
...,...,...
95,88,linux
96,88,winapi
97,88,visual-c++
98,88,unix


In [223]:
# Attach every tag to its question (inner join on Id, dropping the duplicate Id
# column that comes from question_tags), then count the joined rows per tag.
# The joined frame is bound to `df` in this cell so the grouping column refers
# to a frame the notebook actually defines, rather than to a `df` left over
# from an earlier kernel session.
df = (
    questions
    .join(question_tags, questions["Id"] == question_tags["Id"], how="inner")
    .drop(question_tags["Id"])
)
top_tags = df.groupBy(df["tag"]).count()

In [219]:
# Build the question/tag join in this cell so it runs on a fresh kernel instead
# of depending on a `df` that no cell in the notebook creates.
# NOTE(review): assumes `df` was the inner join of questions and question_tags
# on Id (with the duplicate Id column dropped) — confirm against the author's intent.
df = (
    questions
    .join(question_tags, questions["Id"] == question_tags["Id"], how="inner")
    .drop(question_tags["Id"])
)
# Number of joined rows per tag.
top_tags = df.groupBy(df["tag"]).count()

In [225]:
# Collect the whole per-tag count table to the driver as a pandas DataFrame for display.
# NOTE(review): no cell sorts by count, so the rows are shown in arbitrary order.
top_tags.toPandas()

Unnamed: 0,tag,count
0,input,24132
1,iframe,32424
2,standards,3225
3,arguments,7229
4,brackets,869
...,...,...
58251,devpay,1
58252,linked-file,1
58253,business-improvement,1
58254,jfram,1


In [226]:
# Persist the per-tag counts as Parquet.  "overwrite" replaces output left by a
# previous run; the writer's default mode raises when the path already exists,
# which would make the notebook fail the second time it is run.
top_tags.write.mode("overwrite").parquet("data/top_tags.parquet")