In [180]:
import numpy as np
import pandas as pd

In [181]:
import findspark
findspark.init()

In [182]:
spark_url = 'local'
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Spark SQL") \
    .getOrCreate()

In [183]:
year='2024'
path=f'../../../Kafka/output_csv/{year}.csv'
df = spark.read.csv(path, header=True, inferSchema=True)
df.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Abstract: string (nullable = true)
 |-- Author: string (nullable = true)
 |-- Aggregation_Type: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Publication_Date: string (nullable = true)
 |-- Institutions: string (nullable = true)
 |-- Keywords: string (nullable = true)



In [184]:
df.head()

Row(Title='Embracing innovation and collaboration: A message from the new Editor-in-Chief', Abstract=None, Author='Li M.', Aggregation_Type='Journal', Publisher='Cancer Letters', Publication_Date='2024-12-28', Institutions='University of Oklahoma College of Medicine', Keywords=None)

In [185]:
from pyspark.sql.functions import split
df_split = df.withColumn("Keywords", split(df["Keywords"], ";"))

# Show the result
df_split.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [186]:
from pyspark.sql.functions import split, explode, trim, lower, col

df_exploded = df_split.withColumn("Keywords", explode(col("Keywords")))
df_cleaned = df_exploded.withColumn("Keywords", lower(trim(col("Keywords"))))
df_counts = df_cleaned.groupBy("Keywords").count()

# Step 5: Filter keywords with count > 1

df_filtered = df_counts.filter(col("Count") > 5)
df_filtered = df_filtered.sort(col("Count").desc())
df_filtered = df_filtered.withColumnRenamed("count", "Count")
df_filtered = df_filtered.filter(~col("Keywords").isin("thailand", "innovation", "research","engineering","open innovation","open source","open data","open access","open science","open source software","engineering education","engineering design","engineering management","engineering mathematics","engineering mechanics","engineering physics","engineering science","engineering technology"))
df_filtered.show(truncate=False)

+---------------------------+-----+
|Keywords                   |Count|
+---------------------------+-----+
|higher education           |74   |
|artificial intelligence    |73   |
|machine learning           |61   |
|educational innovation     |61   |
|green technology innovation|57   |
|sustainability             |47   |
|deep learning              |32   |
|tissue engineering         |28   |
|education                  |26   |
|technology                 |26   |
|biomaterials               |22   |
|3d printing                |20   |
|bibliometric analysis      |18   |
|blockchain                 |17   |
|digital transformation     |17   |
|drug delivery              |16   |
|sustainable development    |16   |
|industry 4.0               |15   |
|stem                       |15   |
|virtual reality            |15   |
+---------------------------+-----+
only showing top 20 rows



In [187]:
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
df_filtered.toPandas().iplot(kind='bar', x='Keywords', y='Count', title='Keyword Count')


In [188]:
df_filtered.toPandas().rename(columns={'count': f'{year}_Count'}, inplace=True)
df_filtered.toPandas().to_csv(f'../../DA/VisualizeData/{year}_keywords_counts.csv', index=False)