# Main Model
This file contains all the code written for my Bachelor Thesis regarding the main model.

This model is able to classify scientific papers into 1 of the 17 different SDGs.

The workflow contains:
- Reading the data
- Cleaning the data
- Preparing the data
- Quality checking the data
- Plots
- Main modeling part
- Evaluation part

To check the complete worflow please refer to the other file in this folder: "SDG_mapping_complete_workflow.ipynb"

Regular quality checks are done and will not be described explicitly.

# Installation and Imports

In [None]:
!pip install langdetect

In [None]:
!pip install pandas

In [4]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=1e046ba7c4318e7b86936b58d0eec6c86e925c3b811b2f7078cde0845f3eeb8e
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [22]:
import pandas as pd
import numpy as np
from IPython.display import display
from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec, StringIndexer, OneHotEncoder
from pyspark.sql.functions import array
# from google.colab import drive
# drive.mount('/content/gdrive')

# Reading the data

In [6]:
df = pd.read_csv('osdg-community-data_high_IAA.tsv', sep='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28649 entries, 0 to 28648
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   doi              28649 non-null  object 
 1   text_id          28649 non-null  object 
 2   text             28649 non-null  object 
 3   sdg              28649 non-null  int64  
 4   labels_negative  28649 non-null  int64  
 5   labels_positive  28649 non-null  int64  
 6   agreement        28649 non-null  float64
dtypes: float64(1), int64(3), object(3)
memory usage: 1.5+ MB


In [7]:
# df.shape
df.groupby('sdg').count()
# display(df)
# df.dtypes

Unnamed: 0_level_0,doi,text_id,text,labels_negative,labels_positive,agreement
sdg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1743,1743,1743,1743,1743,1743
2,1386,1386,1386,1386,1386,1386
3,2187,2187,2187,2187,2187,2187
4,2983,2983,2983,2983,2983,2983
5,3401,3401,3401,3401,3401,3401
6,1879,1879,1879,1879,1879,1879
7,2398,2398,2398,2398,2398,2398
8,1104,1104,1104,1104,1104,1104
9,1066,1066,1066,1066,1066,1066
10,945,945,945,945,945,945


# Data cleaning and processing
In this code section the data is being cleaned and processed.

In [8]:
# Define coltypes

df = df.astype({'doi':'string'})
df = df.astype({'text_id':'string'})
df = df.astype({'text':'string'})
df = df.astype({'sdg':'string'})
df = df.astype({'labels_negative':'int'})
df = df.astype({'labels_positive':'int'})
df = df.astype({'agreement':'int'})

# df[['kurzfassung']] = df[['kurzfassung']].fillna(value='unknown')
# df = df.drop(df[df.kurzfassung == 'unknown'].index)

# def language_detect(x):
#     lang = detect(x)
#     return lang
#
# df['language'] = df['kurzfassung'].apply(language_detect)
# df.groupby('language').count()
# df = df.astype({'language':'string'})
# df = df.drop(df[df.language == 'de'].index)
# df.groupby('language').count()

In [9]:
# SDGs to int

df['sdg'] = df['sdg'].str.replace('SDG', '') # remove SDG
df['sdg'] = df['sdg'].astype('int') # change column type to integer

In [10]:
df.columns

Index(['doi', 'text_id', 'text', 'sdg', 'labels_negative', 'labels_positive',
       'agreement'],
      dtype='object')

# PySpark

In this section PySpark is being used for:
- Word2Vec
- StringIndexing
- OneHotEncoding


In [11]:
# create a spark session
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [36]:
spark_df = spark.createDataFrame(df)

spark_df_arrayed = spark_df.withColumn("doi", array(spark_df["doi"]))
spark_df_arrayed = spark_df.withColumn("text_id", array(spark_df["text_id"]))
spark_df_arrayed = spark_df.withColumn("text", array(spark_df["text"]))

In [37]:
spark_df_arrayed.printSchema()

root
 |-- doi: string (nullable = true)
 |-- text_id: string (nullable = true)
 |-- text: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- sdg: long (nullable = true)
 |-- labels_negative: long (nullable = true)
 |-- labels_positive: long (nullable = true)
 |-- agreement: long (nullable = true)



In [40]:
# Word vectors for text columns

def word_vectorizer(df, col, col_new):
    #create an average word vector for text columns
    word2vec = Word2Vec(vectorSize = 10, minCount = 1, inputCol = col, outputCol = col_new)
    model = word2vec.fit(df)
    message_w2vec = model.transform(df)

    # drop original column
    message_w2vec = message_w2vec.drop(col)

    return message_w2vec

# TODO: Try creating a new df for each function call?
# Apply function for text columns
col = 'doi'
col_new = 'doi_vectorized'
spark_df_vectorized = word_vectorizer(spark_df_arrayed, col, col_new)

col = 'text_id'
col_new = 'text_id_vectorized'
spark_df_vectorized = word_vectorizer(spark_df_vectorized, col, col_new)

col = 'text'
col_new = 'text_vectorized'
spark_df_vectorized = word_vectorizer(spark_df_vectorized, col, col_new)

IllegalArgumentException: ignored

In [39]:
spark_df_vectorized.printSchema()

root
 |-- doi: string (nullable = true)
 |-- text_id: string (nullable = true)
 |-- sdg: long (nullable = true)
 |-- labels_negative: long (nullable = true)
 |-- labels_positive: long (nullable = true)
 |-- agreement: long (nullable = true)
 |-- text_vectorized: vector (nullable = true)



# Main Model
In this section the main model is trained.

In [None]:
# data split into train and test

(trainDF, testDF) = spark_df_vectorized.randomSplit([0.7, 0.3], seed=12)

trainDF2 = trainDF.groupby('sdg').count()
trainDF2.show()

testDF2 = testDF.groupby('sdg').count()
testDF2.show()

In [None]:
# main model

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


if __name__ == "__main__":
    spark = SparkSession \
    .builder \
    .appName("MulticlassLogisticRegressionWithElasticNet") \
    .getOrCreate()

spark_df_renamed = spark_df_vectorized.withColumnRenamed('sdg', 'label')

feature = spark_df_renamed.drop("label").columns
print(feature)

(trainDF, testDF) = spark_df_renamed.randomSplit([0.7, 0.3], seed=12)

# This includes both the numeric columns and the one-hot encoded binary vector columns in our dataset.
vecAssembler = VectorAssembler(inputCols=feature, outputCol="features")

# maxIter=10, regParam=0.3, elasticNetParam=0.8
lr = LogisticRegression()

#Build the Pipeline
# Define the pipeline based on the stages created in previous steps.
pipeline = Pipeline(stages=[vecAssembler, lr])

# Define the pipeline model.
pipelineModel = pipeline.fit(trainDF)


# Create model.
lrModel = pipelineModel.stages[-1]
    
print("#######\n#######\n#######\n")
print("Now printing: Model test")

# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i+1, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i+1, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i+1, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i+1, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i+1, f))

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
    % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

print("#######\n#######\n#######\n")
print("Now printing: Model validation prediction")

# Apply the pipeline model to the test dataset.
validation_prediction = pipelineModel.transform(testDF)

#Display the predictions from the model. The features column is a sparse vector, which is often the case after one-hot encoding, #because there are so many 0 values
validation_prediction.select("features", "label", "prediction").show()

#Evaluate the model
#The display command has a built-in ROC curve option.

print(pipelineModel.stages[-1], validation_prediction.drop("prediction", "rawPrediction"), "ROC")

mcEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Accuracy: {mcEvaluator.evaluate(validation_prediction)}")


In [None]:
# Quality check on how the model performs on each different SDG
from pyspark.sql.functions import sum, cols_to_drop_later, desc
from pyspark.sql import functions as F

validation_count = validation_prediction.groupby('label', 'prediction').count()
validation_count = validation_count.sort(desc("count"))
validation_count.show(200)
y = validation_prediction.count()


data_collect = validation_count.collect()

x = 0
# looping thorough each row of the dataframe
for row in data_collect:
    if row['label'] == row['prediction']:
      x += row['count']

acc = x / y
print("accuracy:", acc)

to_check_sdg = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
d_error_by_label = {}
cols_to_drop_later = ['label', 'prediction']

for iii in to_check_sdg:
  new_df_validation = validation_count.filter(cols_to_drop_later("label") == iii)
  dfxx = new_df_validation.toPandas()
  listxx = dfxx['count'].tolist()
  #print(listxx)
  count_total = 0.0
  for x in listxx:
    count_total += x
  count_total *= 1.0
  print("count_total",count_total)

  dfxxx = dfxx[dfxx['label'] != dfxx['prediction']]
  listxx = dfxxx['count'].tolist()
  count_total_errors = 0.0
  for x in listxx:
    count_total_errors += x
  count_total_errors *= 1.0
  print("count_total_errors",count_total_errors)
  d_error_by_label[iii] = 1 - count_total_errors/count_total

print("errors:", d_error_by_label)


+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|   15|      15.0|   13|
|    3|       3.0|    7|
|   13|      13.0|    7|
|   16|      16.0|    6|
|    5|       5.0|    4|
|    8|      10.0|    4|
|    9|      12.0|    3|
|   12|      12.0|    3|
|    2|       2.0|    3|
|    7|       7.0|    3|
|   15|      13.0|    3|
|   15|      14.0|    3|
|   12|      17.0|    2|
|   10|      17.0|    2|
|   16|       3.0|    2|
|   15|       1.0|    2|
|    9|       9.0|    2|
|    8|       3.0|    2|
|    8|       9.0|    2|
|    5|       8.0|    1|
|    3|      14.0|    1|
|    3|      16.0|    1|
|    5|      12.0|    1|
|    6|      13.0|    1|
|   10|      12.0|    1|
|   10|      16.0|    1|
|   10|       5.0|    1|
|   11|      14.0|    1|
|    8|       8.0|    1|
|   10|      10.0|    1|
|   13|      15.0|    1|
|    3|      10.0|    1|
|    1|       8.0|    1|
|    4|       8.0|    1|
|    5|      10.0|    1|
|    7|       2.0|    1|
|    1|       1.0|    1|
