In [74]:
#Dependencies
from sklearn.model_selection import train_test_split
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark import SparkContext, since
from pyspark.ml.feature import CountVectorizer, StopWordsRemover, HashingTF, IDF, Tokenizer, StringIndexer

In [75]:
# Obtain the notebook's Spark session (application name 'nlp'),
# reusing an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [78]:
# Load the FedReporter export; the first line of the file supplies the column names.
df = spark.read.csv("FedReporter.csv", header=True)

# Preview the first rows of the loaded frame.
df.show()

+------+-------------------+-------------------+---------+----------+-----------+--------------------+----------+------------+-----------------+--------------------+--------------------+------------------+----------------+----------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+----------+
|Agency|    Budget End Date|  Budget Start Date|CFDA code|Department|Fiscal Year|           Key Words|  Latitude|   Longitude|Organization City|Organization Country|   Organization Name|Organization State|Organization ZIP|Principal Investigator|    Project Abstract|   Project End Date|      Project Number| Project Start Date|       Project Title|           Seed Term|Total Cost|
+------+-------------------+-------------------+---------+----------+-----------+--------------------+----------+------------+-----------------+--------------------+--------------------+------------------+----------------+----------------

In [92]:
# Number of rows in `df` as loaded (no de-duplication or filtering has been applied yet).
df.count()

28303

In [93]:
# TODO: remove duplicate projects (not implemented yet; the cells below still use the unfiltered `df`)


In [94]:
# Abstracts
# ---------------

# Tokenizer stage: reads the "Project Abstract" column and writes the
# resulting token list to "abstract_words_raw".
tokenizer = Tokenizer(
    inputCol="Project Abstract",
    outputCol="abstract_words_raw",
)
tokenizer

Tokenizer_41c9831f753c21ee2acb

In [95]:
# Create a function to return the length of a list
def abstract_word_list_length(abstract_word_list):
    """Return the number of tokens in ``abstract_word_list``.

    Parameters
    ----------
    abstract_word_list : list or None
        Token list for a single abstract. When this function is wrapped as a
        Spark UDF, a null cell arrives here as ``None``.

    Returns
    -------
    int
        ``len(abstract_word_list)``, or ``0`` when the input is ``None``.
    """
    # Python UDFs get no automatic null check; without this guard a single
    # null row would raise TypeError and fail the whole Spark job.
    if abstract_word_list is None:
        return 0
    return len(abstract_word_list)

In [96]:
# Wrap the list-length helper as a Spark UDF whose result column is an integer.
count_tokens = udf(abstract_word_list_length, returnType=IntegerType())
count_tokens

<function __main__.abstract_word_list_length>

In [97]:
# Run the tokenizer over the loaded frame; this adds the "abstract_words_raw" column.
abstract_tokenized_df = tokenizer.transform(df)

# Show each abstract next to its tokens and a per-row token count,
# printing the full cell contents rather than truncating them.
(
    abstract_tokenized_df
    .select("Project Abstract", "abstract_words_raw")
    .withColumn("tokens", count_tokens(col("abstract_words_raw")))
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [98]:
# Stop-word removal stage (default stop-word list, since none is passed):
# reads "abstract_words_raw" and writes "abstract_words_filtered".
remover = StopWordsRemover(
    inputCol="abstract_words_raw",
    outputCol="abstract_words_filtered",
)

In [99]:
# Apply the stop-word remover to the tokenized frame; this adds the
# "abstract_words_filtered" column. Nothing is displayed by this cell.
abstract_filtered_df = remover.transform(abstract_tokenized_df)

In [100]:
# TF/IDF workaround (disabled; kept for reference)
# ------------
# StringIndexer to check the vocabulary size

# indexer = StringIndexer(inputCol="abstract_words_filtered", outputCol="abstract_words_Indexed")
# indexed = indexer.fit(abstract_filtered_df).transform(abstract_filtered_df)
# indexed.show()

In [101]:
# Term frequency (TF)
# Learn a vocabulary (capped at 100000 terms) from the filtered tokens and
# turn every abstract into a term-count vector in "cv_Features".
cv = CountVectorizer(
    inputCol="abstract_words_filtered",
    outputCol="cv_Features",
    vocabSize=100000,
)
cv_model = cv.fit(abstract_filtered_df)
cv_result = cv_model.transform(abstract_filtered_df)

# Print the first three rows: filtered tokens alongside their count vectors.
cv_preview_rows = cv_result.select("abstract_words_filtered", "cv_Features").take(3)
for preview_row in cv_preview_rows:
    print(preview_row)


Row(abstract_words_filtered=['advances', 'sequencing', 'technology', 'dramatically', 'shifted', 'way', 'science', 'performed', 'many', 'fields', 'biology,', 'also', 'shifted', 'way', 'education', 'next', 'generation', 'scientists', 'must', 'conducted.', 'however,', 'generation', 'sequencing', 'data', 'restricted', 'large', 'laboratories,', 'consortia', 'laboratories,', 'core', 'sequencing', 'facilities,', 'limiting', 'exposure', 'undergraduates', 'technologies.', 'recent', 'introduction', 'benchtop', 'next', 'generation', 'sequencing', 'platforms', 'promises', 'make', 'moderately', 'high', 'throughput', 'sequencing', 'available', 'many', 'individual', 'research', 'laboratories', 'smaller', 'institutions.', 'major', 'research', 'instrumentation', 'grant', 'supports', 'acquisition', 'ion', 'torrent', 'personal', 'genome', 'machine', '(ion', 'pgm)', 'sequencing', 'platform', 'hope', 'college.', 'ion', 'pgm', 'significant', 'impact', 'undergraduate', 'research', 'training', 'hope', 'colleg

In [102]:
# Inverse document frequency (IDF)
# Fit IDF weights on the term-count vectors, then rescale those vectors
# into the "idf_features" column.
idf = IDF(
    inputCol="cv_Features",
    outputCol="idf_features",
)
idfModel = idf.fit(cv_result)
rescaledData = idfModel.transform(cv_result)

# Print the first twenty rows: TF-IDF vector alongside the filtered tokens.
tfidf_preview_rows = rescaledData.select("idf_features", "abstract_words_filtered").take(20)
for tfidf_row in tfidf_preview_rows:
    print(tfidf_row)

Row(idf_features=SparseVector(50663, {2: 1.6419, 4: 8.7403, 5: 1.003, 7: 1.6404, 8: 1.3756, 9: 0.7737, 10: 0.8027, 13: 20.1663, 20: 5.3513, 21: 0.8351, 24: 1.2127, 27: 1.0057, 29: 2.8801, 34: 1.1249, 35: 3.9023, 37: 1.0098, 38: 1.1757, 42: 0.8133, 49: 2.0758, 54: 3.0154, 58: 4.8335, 61: 1.3413, 62: 3.1883, 63: 2.5538, 65: 4.5462, 78: 2.0851, 90: 3.0813, 101: 3.5613, 103: 1.6609, 104: 1.774, 109: 1.619, 125: 1.6652, 128: 7.7222, 130: 3.4986, 141: 1.68, 145: 2.2249, 148: 1.813, 157: 3.7237, 158: 1.9209, 161: 6.1081, 163: 1.9843, 166: 2.3579, 171: 5.6967, 172: 2.1248, 176: 1.8206, 177: 1.8133, 180: 5.5038, 189: 6.2957, 198: 1.9944, 199: 4.0192, 201: 2.9839, 202: 2.2084, 207: 22.8698, 218: 2.3475, 227: 2.1576, 237: 2.164, 239: 2.1868, 240: 4.6147, 244: 2.0306, 252: 2.1009, 257: 2.2023, 258: 2.3354, 259: 4.2508, 262: 2.0499, 278: 7.1201, 285: 11.2756, 292: 2.2484, 312: 2.4073, 315: 2.4356, 319: 2.2602, 320: 2.3292, 331: 4.5593, 335: 3.145, 349: 2.4152, 357: 7.6163, 366: 2.3913, 368: 2.7112,

In [103]:
# Stop the Spark session created at the top of the notebook now that the analysis is done.
spark.stop()