In [1]:
import pandas as pd  # data processing
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  # data split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from mp_api import MPRester
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

In [8]:
"""
_______________________________________________________
Problem 2: Credit Card Fraud
_______________________________________________________
"""

"""
-------------------------------------------------------------------------
Dataset take from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
-------------------------------------------------------------------------
"""

# Load the credit-card transactions and discard the 'Time' column
# (it is a column, not a row, and is not used as a model feature below).
df = pd.read_csv('creditcard.csv')
df = df.drop(columns='Time')

print(df.head())

# Split the frame by label once and reuse the two subsets for both the
# case counts and the transaction-amount statistics.
nonfraud_cases = df[df.Class == 0]
fraud_cases = df[df.Class == 1]

cases = len(df)
nonfraud_count = len(nonfraud_cases)
fraud_count = len(fraud_cases)
# Percentage of ALL cases that are fraud: the denominator is the total number
# of cases, not the number of non-fraud cases. Guarded so an empty file
# reports 0.0 instead of raising ZeroDivisionError.
fraud_percentage = round(fraud_count / cases * 100, 2) if cases else 0.0

print('CASE COUNT')
print('--------------------------------------------')
print('Total number of cases are {}'.format(cases))
print('Number of Non-fraud cases are {}'.format(nonfraud_count))
print('Number of fraud cases are {}'.format(fraud_count))
print('Percentage of fraud cases is {}%'.format(fraud_percentage))
print('--------------------------------------------')

# Summary statistics of the transaction amount for each class.
print('CASE AMOUNT STATISTICS')
print('--------------------------------------------')
print('NON-FRAUD CASE AMOUNT STATS')
print(nonfraud_cases.Amount.describe())
print('--------------------------------------------')
print('FRAUD CASE AMOUNT STATS')
print(fraud_cases.Amount.describe())
print('--------------------------------------------')

# Feature scaling and train/test split.
#
# The train/test rows are chosen FIRST and the scaler is fitted on the
# training rows only, so no statistic of the held-out rows leaks into the
# preprocessing. The fitted scaler is then applied to every row.
# (The previous bare `df.reset_index()` calls were removed: the result was
# discarded, so they had no effect.)
amount = df['Amount'].values
y = df['Class'].values

# stratify=y keeps the fraud / non-fraud ratio the same in both partitions.
row_ids = np.arange(len(df))
train_idx, test_idx = train_test_split(
    row_ids, test_size=0.2, random_state=0, stratify=y
)

# Standardise 'Amount' using the mean / std of the training rows.
sc = StandardScaler()
sc.fit(amount[train_idx].reshape(-1, 1))
df['Amount'] = sc.transform(amount.reshape(-1, 1)).ravel()

print(df['Amount'].head(10))

# train_test_split
# x = everything but class, y = class

X = df.drop('Class', axis=1).values
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print('X_train samples : ')
print(X_train[:1])
print('X_test samples : ')
print(X_test[0:1])
print('y_train samples : ')
print(y_train[0:20])
print('y_test samples : ')
print(y_test[0:20])

# MODELING

# Sanitise the arrays the models actually consume. The clean-up used to be
# applied to `df` after X / y had already been extracted, so it never reached
# the model inputs, and it also rebound `df` from a DataFrame to a bare
# ndarray. Infinite and NaN entries are mapped to 0, matching the earlier
# inf -> NaN -> 0 intent.
X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
X_test = np.nan_to_num(X_test, nan=0.0, posinf=0.0, neginf=0.0)

# The commented-out models below are alternatives to the logistic regression;
# un-comment a model together with its matching accuracy print further down.

# 1. Decision Tree

# tree_model = DecisionTreeClassifier(max_depth=4, criterion='entropy')
# tree_model.fit(X_train, y_train)
# tree_yhat = tree_model.predict(X_test)

# 2. K-Nearest Neighbors

# n = 5

# knn = KNeighborsClassifier(n_neighbors=n)
# knn.fit(X_train, y_train)
# knn_yhat = knn.predict(X_test)

# 3. Logistic Regression

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_yhat = lr.predict(X_test)

# 4. SVM
# (named `svm_model`, not `svm`, so that enabling it does not shadow the
# imported `sklearn.svm` module that the spam cell calls as `svm.SVC()`)

# svm_model = SVC()
# svm_model.fit(X_train, y_train)
# svm_yhat = svm_model.predict(X_test)

# 5. Random Forest Tree

# rf = RandomForestClassifier(max_depth=4)
# rf.fit(X_train, y_train)
# rf_yhat = rf.predict(X_test)

# Print accuracy scores
# 1. Accuracy score
# NOTE(review): if fraud cases are rare in y_test, accuracy alone can look
# high for a model that never predicts fraud; consider also reporting
# precision / recall. Confirm against the class counts printed earlier.

separator = '------------------------------------------------------------------------'

print('ACCURACY SCORE')
print(separator)
# print(f'Accuracy score of the Decision Tree model is {accuracy_score(y_test, tree_yhat)}')
# print(separator)
# print(f'Accuracy score of the KNN model is {accuracy_score(y_test, knn_yhat)}')
# print(separator)
lr_accuracy = accuracy_score(y_test, lr_yhat)
print(f'Accuracy score of the Logistic Regression model is {lr_accuracy}')
print(separator)
# print(f'Accuracy score of the SVM model is {accuracy_score(y_test, svm_yhat)}')
# print(separator)
# print(f'Accuracy score of the Random Forest Tree model is {accuracy_score(y_test, rf_yhat)}')
# print(separator)


         V1        V2        V3        V4        V5        V6        V7  \
0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9       V10  ...       V21       V22       V23       V24  \
0  0.098698  0.363787  0.090794  ... -0.018307  0.277838 -0.110474  0.066928   
1  0.085102 -0.255425 -0.166974  ... -0.225775 -0.638672  0.101288 -0.339846   
2  0.247676 -1.514654  0.207643  ...  0.247998  0.771679  0.909412 -0.689281   
3  0.377436 -1.387024 -0.054952  ... -0.108300  0.005274 -0.190321 -1.175575   
4 -0.270533  0.817739  0.753074  ... -0.009431  0.798278 -0.137458  0.141267   

        V25       V26       V27       V28  Amount  Class  
0  0.1285

In [9]:
"""
_______________________________________________________
Problem 2: Email Spam
_______________________________________________________
"""

"""
------------------------------------------------------------------------------------------------
Dataset take from https://github.com/SmallLion/Python-Projects/blob/main/Spam-detection/spam.csv
------------------------------------------------------------------------------------------------
"""

# Load the spam dataset: column 'v2' is used as the message text and
# column 'v1' as the label to predict.
spam = pd.read_csv('spam.csv')

text = spam['v2']
label = spam["v1"]
# A fixed random_state makes the split, and therefore the reported accuracy,
# reproducible across runs (same seed as the credit-card split).
text_train, text_test, label_train, label_test = train_test_split(
    text, label, test_size=0.2, random_state=0
)

# Counts words and occurrences in each email.
# The vocabulary is learned from the training messages only.
cv = CountVectorizer()
features = cv.fit_transform(text_train)

# Support vector machine algorithm
# separates data into classes
model = svm.SVC()
model.fit(features, label_train)

# Re-use the fitted vocabulary (transform, not fit_transform) on the test messages.
features_test = cv.transform(text_test)
print("Accuracy: {}".format(model.score(features_test, label_test)))


Accuracy: 0.9802690582959641


In [10]:
"""
_______________________________________________________
Problem 1: Material Project
_______________________________________________________
"""

#Part 2

# Download the dataset CSV from figshare; read_csv already returns a
# DataFrame, so no extra pd.DataFrame(...) wrapping is needed.
OQMD = pd.read_csv("https://ndownloader.figshare.com/files/9158587")
#Q2P2 - There are 5329 compounds
#Q2P3 - There are 21 total columns
print(OQMD.keys())
#Q2P4

#Removing columns that are not tagged to the correct data types or that just say whether a data point is valid
OQMD_df = OQMD.drop(columns = ['Valence A', 'Valence B', 'Magnetic moment [mu_B]', 'Volume per atom [A^3/atom]', 'Band gap [eV]', 'a [ang]', 'b [ang]', 'c [ang]', 'alpha [deg]', 'beta [deg]', 'gamma [deg]', 'Vacancy energy [eV/O atom]', 'Stability [eV/atom]'])
#There are 8 columns remaining
#Convert the formation energy column to proper floats; entries that cannot be parsed become NaN
OQMD_df['Formation energy [eV/atom]'] = pd.to_numeric(OQMD_df['Formation energy [eV/atom]'],errors='coerce')

#Q2P5
# Drop the NaN entries produced by the coercion above before plotting.
# mean() / std() skip NaN by default, so their values are unaffected.
x = OQMD_df['Formation energy [eV/atom]'].dropna()

mean = x.mean()
std = x.std()

fig, ax = plt.subplots()
ax.hist(x)

# Mark the mean and one standard deviation either side of it. The standard
# deviation is a spread, not a position on the x-axis, so it is drawn
# relative to the mean rather than at x = std - 1.
ax.axvline(mean, color='black', linestyle='-', label='mean = {:.3f}'.format(mean))
ax.axvline(mean - std, color='gray', linestyle='--', label='mean +/- 1 std (std = {:.3f})'.format(std))
ax.axvline(mean + std, color='gray', linestyle='--')

ax.set_xlabel('Formation energy [eV/atom]')
ax.set_ylabel('Number of compounds')
ax.set_title('Distribution of formation energy')
ax.legend()

plt.show()

print(std)


Index(['Chemical formula', 'A', 'B', 'In literature', 'Valence A', 'Valence B',
       'Radius A [ang]', 'Radius B [ang]', 'Lowest distortion',
       'Formation energy [eV/atom]', 'Stability [eV/atom]',
       'Magnetic moment [mu_B]', 'Volume per atom [A^3/atom]', 'Band gap [eV]',
       'a [ang]', 'b [ang]', 'c [ang]', 'alpha [deg]', 'beta [deg]',
       'gamma [deg]', 'Vacancy energy [eV/O atom]'],
      dtype='object')


NameError: name 'plt' is not defined