In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [8]:
#these graphs will have the following columns:
#File Name,label,Number of Nodes,Number of Edges,Average Degree,Theoretical Avg Clustering,Average Clustering,Theoretical Avg Path Length,Average Path Length,Log of Nodes,Log Log of Nodes,Size of Largest CC
#You can avoid to use the following columns: File Name
#the label of each row can be found in the column: label
#Can you run several machine learning algorithms over these datasets. Partition each dataset into training and testing sets and then report accuracy
# in each dataset seperatly. Use classification methods fron K neirest neighbors, Random Forests, XGBoost, regularization methods, MLPs
# Load the datasets
# Load the datasets
csv_path1 = 'graph_statistics-4.5.csv'
csv_path2 = 'graph_statistics.csv'

df1 = pd.read_csv(csv_path1)
df2 = pd.read_csv(csv_path2)

# Drop 'File Name' column and separate features and labels
def prepare_data(df):
    df = df.drop(columns=['File Name'])
    X = df.drop(columns=['label'])
    y = df['label']
    return X, y

X1, y1 = prepare_data(df1)
X2, y2 = prepare_data(df2)

# Encode the labels
label_encoder = LabelEncoder()
y1 = label_encoder.fit_transform(y1)
y2 = label_encoder.fit_transform(y2)

# Standardize the features
scaler = StandardScaler()
X1 = scaler.fit_transform(X1)
X2 = scaler.fit_transform(X2)

# Split datasets into training and testing sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Define the classifiers
classifiers = {
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'MLP Classifier': MLPClassifier(random_state=42)
}

# Train and evaluate the classifiers on both datasets
results = {}

for name, clf in classifiers.items():
    clf.fit(X1_train, y1_train)
    y1_pred = clf.predict(X1_test)
    acc1 = accuracy_score(y1_test, y1_pred)
    
    clf.fit(X2_train, y2_train)
    y2_pred = clf.predict(X2_test)
    acc2 = accuracy_score(y2_test, y2_pred)
    
    results[name] = {'Dataset 1 Accuracy': acc1, 'Dataset 2 Accuracy': acc2}

# Display the results
for clf_name, acc in results.items():
    print(f"{clf_name}:")
    print(f"  Dataset 1 Accuracy: {acc['Dataset 1 Accuracy']:.4f}")
    print(f"  Dataset 2 Accuracy: {acc['Dataset 2 Accuracy']:.4f}")
    print()

K-Nearest Neighbors:
  Dataset 1 Accuracy: 0.2500
  Dataset 2 Accuracy: 0.2266

Random Forest:
  Dataset 1 Accuracy: 0.3438
  Dataset 2 Accuracy: 0.1953

XGBoost:
  Dataset 1 Accuracy: 0.3359
  Dataset 2 Accuracy: 0.2188

Logistic Regression:
  Dataset 1 Accuracy: 0.2734
  Dataset 2 Accuracy: 0.2734

MLP Classifier:
  Dataset 1 Accuracy: 0.2969
  Dataset 2 Accuracy: 0.2656





In [10]:
import pandas as pd

# Load the datasets
csv_path1 = 'graph_statistics-4.5.csv'
csv_path2 = 'graph_statistics.csv'

df1 = pd.read_csv(csv_path1)
df2 = pd.read_csv(csv_path2)

# Drop 'File Name' column
df1 = df1.drop(columns=['File Name'])
df2 = df2.drop(columns=['File Name'])

# Group by 'label' and calculate the mean for each group
summary_df1 = df1.groupby('label').mean().reset_index()
summary_df2 = df2.groupby('label').mean().reset_index()

# Display the summary dataframes4
print("Summary DataFrame 1 STD: .5")
summary_df1

Summary DataFrame 1 STD: 4.5


Unnamed: 0,label,Number of Nodes,Number of Edges,Average Degree,Theoretical Avg Clustering,Average Clustering,Theoretical Avg Path Length,Average Path Length,Log of Nodes,Log Log of Nodes,Size of Largest CC
0,BPP,718.0,5083.513333,14.160204,0.019749,0.109788,12.080619,4.309249,6.57647,1.883498,146.413333
1,CON,718.0,1672.450495,4.658636,0.006497,0.089422,9.371822,4.003032,6.57647,1.883498,118.435644
2,SADP,718.0,2622.663866,7.30547,0.010189,0.094796,8.18154,4.107825,6.57647,1.883498,147.941176
3,SCZP,718.0,8815.347305,24.555285,0.034247,0.120812,6.689573,4.355858,6.57647,1.883498,156.892216


In [11]:
print("\nSummary DataFrame 2 STD: 2")
summary_df2


Summary DataFrame 2 STD: 2


Unnamed: 0,label,Number of Nodes,Number of Edges,Average Degree,Theoretical Avg Clustering,Average Clustering,Theoretical Avg Path Length,Average Path Length,Log of Nodes,Log Log of Nodes,Size of Largest CC
0,BPP,718.0,82137.066667,228.794058,0.319099,0.553685,1.228181,1.689993,6.57647,1.883498,717.833333
1,CON,718.0,78250.405941,217.967705,0.304,0.550745,1.230697,1.705816,6.57647,1.883498,717.876238
2,SADP,718.0,78832.07563,219.587954,0.306259,0.541535,1.231511,1.704423,6.57647,1.883498,717.840336
3,SCZP,718.0,84185.670659,234.500475,0.327058,0.557667,1.224293,1.681004,6.57647,1.883498,717.886228
