In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, col, lower, regexp_replace, split

In [3]:
spark= SparkSession.builder.config('spark.ui.port','4040').getOrCreate()

In [28]:
authors = spark.read.csv("file:///N/project/mag/mag-2021-01-05/mag/Authors.txt",\
                         header = False,\
                         sep=r'\t')
authors.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)



In [22]:
def clean_text(c):
  c = lower(c)
  c = regexp_replace(c, "^rt ", "")
  c = regexp_replace(c, "(https?\://)\S+", "")
  c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
  #c = split(c, "\\s+") tokenization...
  return c

In [29]:
authors = authors.select(
        authors._c0.cast("int").alias("authorId"),
        authors._c1.cast("int").alias("rank"),
        clean_text(col("_c3")).alias("displayName"),
        authors._c2.cast("string").alias("normalizedName"),
        authors._c4.cast("int").alias("lastKnownAffiliationId"),
        authors._c5.cast("int").alias("paperCount"),
        authors._c7.cast("int").alias("citationCount"), 
        to_timestamp(authors._c8, "yyyy-MM-dd").alias("createdDate")
        )

In [30]:
authors.printSchema()

root
 |-- authorId: integer (nullable = true)
 |-- rank: integer (nullable = true)
 |-- normalizedName: string (nullable = true)
 |-- displayName: string (nullable = true)
 |-- lastKnownAffiliationId: integer (nullable = true)
 |-- paperCount: integer (nullable = true)
 |-- citationCount: integer (nullable = true)
 |-- date: timestamp (nullable = true)



In [31]:
authors.show(10)

+--------+-----+--------------------+--------------------+----------------------+----------+-------------+-------------------+
|authorId| rank|      normalizedName|         displayName|lastKnownAffiliationId|paperCount|citationCount|               date|
+--------+-----+--------------------+--------------------+----------------------+----------+-------------+-------------------+
|     584|19419|gozde ozdikmenlid...| gzde zdikmenlidemir|              79946792|         5|           11|2016-06-24 00:00:00|
|     859|19820|           gy tolmar|            gy tolmr|                  null|         3|            2|2016-06-24 00:00:00|
|     978|17891|      ximena faundez|       ximena fandez|             149744451|        19|           62|2016-06-24 00:00:00|
|    1139|19630|      jennifer putzi|      jennifer putzi|                  null|         4|            6|2016-06-24 00:00:00|
|    1476|21131|           勲矢 手島|                    |                  null|         1|            0|2016-06-2

In [36]:
authors.coalesce(10).write.option("header","true") \
                          .option("sep","~") \
                          .option("quoteAll", True) \
                          .mode("overwrite") \
                          .csv('/N/project/mag/mag_jg_2021_update/nodes/authors') 