In [0]:
# This section is for data collection
#The dataset is found here ---->   https://archive.ics.uci.edu/dataset/410/paper+reviews
#The dataset is licensed under a Creative Commons Attribution 4.0 International (CC BY 4.0) license
#github push test
import json
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statistics
# Location of the UCI "Paper Reviews" dataset, relative to the notebook
filePath = 'dataset/reviews.json'

# Tally of preliminary decisions, pre-seeded with the labels we expect to see
decisionCounts = dict.fromkeys(
    ('accept', 'probably reject', 'reject', 'no decision'),
    0,
)

# Per-review vectors and missing-value bookkeeping (filled further down)
evaluations, confidences, orientations = [], [], []
missingValuesPositions = []
nullValuesCount = 0

# Read the whole JSON document into memory
with open(filePath, mode='r', encoding='utf-8') as file:
    data = json.loads(file.read())


# Dump every paper with its reviews and tally the preliminary decisions.
# (label, key) pairs printed for each review, in display order.
reviewReportFields = (
    ("  Review ID:", 'id'),
    ("  Evaluation:", 'evaluation'),
    ("  Confidence:", 'confidence'),
    ("  Date:", 'timespan'),
)

for paper in data['paper']:
    print("Paper ID:", paper['id'])
    decision = paper['preliminary_decision'].lower()
    print("Preliminary Decision:", decision)

    # Labels that were not pre-seeded start counting from zero
    decisionCounts[decision] = decisionCounts.get(decision, 0) + 1

    print("Reviews:")
    for review in paper['review']:
        for reviewLabel, reviewKey in reviewReportFields:
            print(reviewLabel, review[reviewKey])
        print("")
    print()
    

# Start again from empty per-review vectors
evaluations, orientations, decisions = [], [], []

# Numeric code assigned to each textual preliminary decision
decisionMapping = {
    label: code
    for code, label in enumerate(("no decision", "accept", "probably reject", "reject"))
}

# First pass: gather every confidence that is actually present so that the
# median can later stand in for the missing ones
confidences = [
    int(review['confidence'])
    for paper in data['paper']
    for review in paper['review']
    if review.get('confidence') is not None
]

medianConfidence = np.median(confidences)


# Second pass: build the per-review vectors used for modelling.
# Invariant: evaluations, confidences, orientations and decisions always have
# the same length, i.e. index i in every list refers to the same review.
evaluations = []
confidences = []
orientations = []
decisions = []

for paper in data['paper']:
    # Convert the textual decision into its numerical code. Normalise case and
    # surrounding whitespace first so that e.g. 'Accept' is not silently
    # mapped to the fallback code 0 ("no decision").
    paperDecision = paper['preliminary_decision'].strip().lower()
    decisionCode = decisionMapping.get(paperDecision, 0)

    for review in paper['review']:
        # .get() yields None both for a missing key and for an explicit null
        evaluation = review.get('evaluation')
        confidence = review.get('confidence')
        orientation = review.get('orientation')

        # Every missing field is counted, whether or not the review is kept
        nullValuesCount += sum(
            value is None for value in (evaluation, confidence, orientation)
        )

        # There is no imputation rule for evaluation/orientation, so a review
        # lacking either one is left out of ALL vectors. Appending only some of
        # its fields would misalign the lists and make the later
        # pd.DataFrame(...) construction fail with a length mismatch.
        if evaluation is None or orientation is None:
            continue

        evaluations.append(int(evaluation))
        # A missing confidence is imputed with the median of the known ones
        # (medianConfidence comes from the first pass over the data)
        confidences.append(
            int(medianConfidence) if confidence is None else int(confidence)
        )
        orientations.append(int(orientation))
        # Every review inherits the decision code of its paper
        decisions.append(decisionCode)

# NOTE(review): this pass only rebinds `decisionClass` for each review; the
# value is never stored because the append below is commented out, so the
# loop does not contribute to any of the collected vectors.
for paper in data['paper']:
    for review in paper['review']:
        # Falls back to 0 when the review carries no 'decision' key
        decisionClass = review.get('decision', 0)

        # decisions.append(decisionClass)

# Report how many fields were missing across all reviews
print(f"There are combined {nullValuesCount} null values for orientation, confidence and evaluation.")

# NumPy views of the collected vectors plus basic confidence statistics
# (the confidences now include the median-imputed entries)
evaluationsNP = np.asarray(evaluations)
confidencesArray = np.asarray(confidences)
meanConfidence = confidencesArray.mean()
medianConfidence = np.median(confidencesArray)

# Quick size check of the two main vectors
print(f"The length of 'confidences' is {len(confidences)}")
print(f"The length of 'evaluations' is {len(evaluations)}")

In [0]:
#This section is for DBSCAN


import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt



# Assemble the per-review vectors collected from the dataset into one table
df = pd.DataFrame(
    dict(
        Evaluation=evaluations,
        Confidence=confidences,
        Orientation=orientations,
        Decision=decisions,
    )
)

# Features used for clustering ('Evaluation' is left out), standardised to
# zero mean and unit variance
clusteringFeatures = ['Confidence', 'Orientation', 'Decision']
X = df[clusteringFeatures]
scaler = StandardScaler()
xScaled = scaler.fit(X).transform(X)

# Grid search over DBSCAN hyperparameters on the standardised features.
# Note: with an eps value above 1.5 we only get one cluster.
# eps: maximum distance between two samples for one to be considered in the
# neighborhood of the other
epsilonValues = [0.5, 1, 1.1, 1.3, 1.5]
# min_samples: number of samples in a neighborhood for a point to be
# considered a core point
minSamplesValues = [5, 10, 15, 20, 25]

for epsilon in epsilonValues:
    for minSamples in minSamplesValues:
        dbscan = DBSCAN(eps=epsilon, min_samples=minSamples)
        clusterLabels = dbscan.fit_predict(xScaled)

        # DBSCAN marks noise with the label -1. Noise is not a cluster, so it
        # must not be counted as one nor scored as if it were one.
        clusteredMask = clusterLabels != -1
        clusterCount = len(np.unique(clusterLabels[clusteredMask]))
        noiseCount = int(np.sum(~clusteredMask))

        # Both indices compare clusters with each other, so they are only
        # defined when at least two real clusters exist.
        if clusterCount > 1:
            # Score the clustered points only; each metric is computed once
            silhouetteAvg = silhouette_score(xScaled[clusteredMask], clusterLabels[clusteredMask])
            daviesBouldin = davies_bouldin_score(xScaled[clusteredMask], clusterLabels[clusteredMask])

            print(f"DBSCAN with eps={epsilon}, min_samples={minSamples}, clusters: {clusterCount}, noise points: {noiseCount}, silhouette: {silhouetteAvg}, davies-bouldin index: {daviesBouldin}")

        else:
            # Zero clusters (everything is noise) or a single cluster: the
            # scores cannot be computed because they depend on the
            # relationship between clusters
            print(f"DBSCAN with eps={epsilon}, min_samples={minSamples} found {clusterCount} cluster(s) and {noiseCount} noise points; at least two clusters are needed for the scores")
        
        