# Dataset import and EDA

Thanks to https://github.com/trqmorgan for compiling the .csv file in his "QFL-Pettijohn-Ternary" repository.

In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [21]:
# Read the dataset from "data.csv" (resolved relative to the working directory) into `data`.
data = pd.read_csv(filepath_or_buffer="data.csv")


In [5]:
# Preview the first five rows of the table.
data.head(n=5)

Unnamed: 0,Classification,Qm,Qmu,Qp,Plag,Afsp,Lf,Pore,PM+Cem
0,Subarkose,84,65,33,10,22,0,53,33
1,Quartz arenite,91,107,2,6,0,0,79,15
2,Subarkose,66,155,5,13,3,0,52,6
3,Subarkose,126,40,3,12,40,0,30,49
4,Subarkose,131,64,3,10,23,0,58,11


In [6]:
# List the dtype pandas inferred for each column of `data`.
data.dtypes

Classification    object
Qm                 int64
Qmu                int64
Qp                 int64
Plag               int64
Afsp               int64
Lf                 int64
Pore               int64
PM+Cem             int64
dtype: object

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Qm,Qmu,Qp,Plag,Afsp,Lf,Pore,PM+Cem
count,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0
mean,96.840909,89.522727,11.318182,8.159091,9.068182,0.931818,37.568182,42.818182
std,29.095627,31.914485,11.141318,13.39202,15.270275,3.245016,21.016492,37.553851
min,25.0,13.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,80.5,64.75,3.0,0.0,0.0,0.0,21.0,16.5
50%,91.0,94.0,6.5,4.0,1.0,0.0,40.5,34.0
75%,107.75,114.25,15.5,9.25,13.5,0.0,52.25,54.5
max,200.0,155.0,39.0,58.0,64.0,18.0,79.0,180.0


### Check unique classes in Classification column

In [8]:
# Distinct class names present in the "Classification" column.
data["Classification"].unique()

array(['Subarkose', 'Quartz arenite', 'Arkose', 'Sublitharenite'],
      dtype=object)

In [9]:
def groups(df):
    """Count the rows of ``df`` for each distinct value of its "Classification" column.

    Returns a Series indexed by classification value whose entries are the
    number of rows carrying that value.
    """
    by_class = df.groupby("Classification")
    counts = by_class.size()
    return counts

# Number of samples per classification; the bare name on the last line displays it.
result = groups(df=data)
result

Classification
Arkose             7
Quartz arenite    25
Subarkose         11
Sublitharenite     1
dtype: int64

Quartz arenite is over-represented (25 of the 44 samples), while Sublitharenite is almost absent (only 1 sample).

In [14]:
def display_df(df):
    """Return ``df`` without its "Classification" column.

    ``df`` itself is left unchanged: ``DataFrame.drop`` returns a new frame.
    Raises ``KeyError`` if ``df`` has no "Classification" column.
    """
    return df.drop(columns=["Classification"])

0          Subarkose
1     Quartz arenite
2          Subarkose
3          Subarkose
4          Subarkose
5             Arkose
6             Arkose
7             Arkose
8     Quartz arenite
9     Quartz arenite
10    Quartz arenite
11         Subarkose
12            Arkose
13            Arkose
14         Subarkose
15         Subarkose
16    Quartz arenite
17    Quartz arenite
18    Quartz arenite
19         Subarkose
20    Quartz arenite
21    Quartz arenite
22    Quartz arenite
23    Quartz arenite
24    Quartz arenite
25    Quartz arenite
26            Arkose
27    Quartz arenite
28    Quartz arenite
29         Subarkose
30         Subarkose
31         Subarkose
32    Quartz arenite
33    Quartz arenite
34            Arkose
35    Sublitharenite
36    Quartz arenite
37    Quartz arenite
38    Quartz arenite
39    Quartz arenite
40    Quartz arenite
41    Quartz arenite
42    Quartz arenite
43    Quartz arenite
Name: Classification, dtype: object

In [16]:
def cluster_dataframe(df, k=4, drop_columns=("Classification",), random_state=0):
    """Cluster the rows of ``df`` with k-means and return the features plus a ``cluster`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; every step below builds a new frame.
    k : int, default 4
        Number of clusters, passed to ``KMeans(n_clusters=k)``.
    drop_columns : str or sequence of str, default ("Classification",)
        Columns removed before clustering (the label column by default).
    random_state : int or None, default 0
        Seed forwarded to ``KMeans`` so cluster ids are reproducible between
        runs. Pass ``None`` for a different random initialisation on each call.

    Returns
    -------
    pandas.DataFrame
        The remaining columns converted to ``int`` with an added ``cluster``
        column holding the k-means label of each row. Rows containing a value
        that cannot be parsed as a number are dropped; the original index of
        the surviving rows is kept.
    """
    # A bare string would otherwise be split into characters by list().
    if isinstance(drop_columns, str):
        drop_columns = [drop_columns]

    numeric = (
        df.drop(columns=list(drop_columns))
        # Non-numeric cells become NaN so the affected rows can be discarded.
        .apply(pd.to_numeric, errors="coerce")
        .dropna()
        .astype(int)
    )

    # n_init is pinned because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    kmeans.fit(numeric)
    numeric["cluster"] = kmeans.labels_
    return numeric

# Cluster every sample into four groups and display the assignments.
clustered = cluster_dataframe(data, k=4)
clustered




Unnamed: 0,Qm,Qmu,Qp,Plag,Afsp,Lf,Pore,PM+Cem,cluster
0,84,65,33,10,22,0,53,33,0
1,91,107,2,6,0,0,79,15,1
2,66,155,5,13,3,0,52,6,1
3,126,40,3,12,40,0,30,49,3
4,131,64,3,10,23,0,58,11,3
5,133,42,1,2,54,0,46,22,3
6,74,72,2,10,64,0,28,50,0
7,83,75,5,8,39,0,39,51,0
8,90,75,13,0,0,0,5,117,2
9,200,80,11,0,0,0,9,0,3


In [17]:
# Put the known labels back next to the cluster ids (aligned on the row index).
clustered["Classification"] = data.loc[:, "Classification"]

In [18]:
# Display the clustered frame, which now also carries the "Classification" labels.
clustered

Unnamed: 0,Qm,Qmu,Qp,Plag,Afsp,Lf,Pore,PM+Cem,cluster,Classification
0,84,65,33,10,22,0,53,33,0,Subarkose
1,91,107,2,6,0,0,79,15,1,Quartz arenite
2,66,155,5,13,3,0,52,6,1,Subarkose
3,126,40,3,12,40,0,30,49,3,Subarkose
4,131,64,3,10,23,0,58,11,3,Subarkose
5,133,42,1,2,54,0,46,22,3,Arkose
6,74,72,2,10,64,0,28,50,0,Arkose
7,83,75,5,8,39,0,39,51,0,Arkose
8,90,75,13,0,0,0,5,117,2,Quartz arenite
9,200,80,11,0,0,0,9,0,3,Quartz arenite


## Using only Qm, Qmu, Qp, Plag, Afsp and Lf Columns


In [19]:
def cluster_dataframe(df, k=4, drop_columns=("Classification", "Pore", "PM+Cem"), random_state=0):
    """Cluster the rows of ``df`` with k-means, ignoring the label, "Pore" and "PM+Cem" columns.

    NOTE: this redefinition replaces any earlier ``cluster_dataframe`` in the
    kernel namespace once the cell has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; every step below builds a new frame.
    k : int, default 4
        Number of clusters, passed to ``KMeans(n_clusters=k)``.
    drop_columns : str or sequence of str, default ("Classification", "Pore", "PM+Cem")
        Columns removed before clustering.
    random_state : int or None, default 0
        Seed forwarded to ``KMeans`` so cluster ids are reproducible between
        runs. Pass ``None`` for a different random initialisation on each call.

    Returns
    -------
    pandas.DataFrame
        The remaining columns converted to ``int`` with an added ``cluster``
        column holding the k-means label of each row. Rows containing a value
        that cannot be parsed as a number are dropped; the original index of
        the surviving rows is kept.
    """
    # A bare string would otherwise be split into characters by list().
    if isinstance(drop_columns, str):
        drop_columns = [drop_columns]

    numeric = (
        df.drop(columns=list(drop_columns))
        # Non-numeric cells become NaN so the affected rows can be discarded.
        .apply(pd.to_numeric, errors="coerce")
        .dropna()
        .astype(int)
    )

    # n_init is pinned because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    kmeans.fit(numeric)
    numeric["cluster"] = kmeans.labels_
    return numeric

# Cluster into four groups, then attach the known labels (aligned on the row index)
# so cluster ids and classifications can be compared side by side.
clustered = cluster_dataframe(data, k=4)
clustered["Classification"] = data.loc[:, "Classification"]
clustered

Unnamed: 0,Qm,Qmu,Qp,Plag,Afsp,Lf,cluster,Classification
0,84,65,33,10,22,0,0,Subarkose
1,91,107,2,6,0,0,1,Quartz arenite
2,66,155,5,13,3,0,1,Subarkose
3,126,40,3,12,40,0,3,Subarkose
4,131,64,3,10,23,0,3,Subarkose
5,133,42,1,2,54,0,3,Arkose
6,74,72,2,10,64,0,0,Arkose
7,83,75,5,8,39,0,0,Arkose
8,90,75,13,0,0,0,0,Quartz arenite
9,200,80,11,0,0,0,3,Quartz arenite


## Using a neural network (NEED A BIGGER DATASET)

In [24]:
# Split the table into the target column and the remaining feature columns.
# Both are independent copies, so later edits to them leave `data` untouched.
labels = data.loc[:, "Classification"].copy()
features = data.drop(columns="Classification").copy()

In [46]:
def neural_net(labels, features, hidden_layer_sizes=(128,), max_iter=20,
               learning_rate_init=0.1, random_state=1, verbose=True):
    """Fit a scikit-learn ``MLPClassifier`` that predicts ``labels`` from ``features``.

    The keyword defaults reproduce the settings that were previously
    hard-coded, so ``neural_net(labels, features)`` behaves as before.

    Parameters
    ----------
    labels : array-like
        Target class of each sample (passed to ``fit`` as ``y``).
    features : array-like or pandas.DataFrame
        Feature matrix with one row per sample (passed to ``fit`` as ``X``).
    hidden_layer_sizes : tuple of int, default (128,)
        Size of each hidden layer.
    max_iter : int, default 20
        Maximum number of training iterations.
    learning_rate_init : float, default 0.1
        Initial learning rate of the Adam solver.
    random_state : int or None, default 1
        Seed for weight initialisation.
    verbose : bool, default True
        Whether the loss is printed after every iteration.

    Returns
    -------
    sklearn.neural_network.MLPClassifier
        The fitted classifier.

    NOTE(review): with the defaults, the loss logged in this notebook rises
    sharply over the first iterations before settling. A smaller
    ``learning_rate_init``, a larger ``max_iter`` and/or scaled features may
    train more smoothly -- confirm before relying on the model.
    """
    clf = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation="relu",
        solver="adam",
        alpha=1e-4,
        tol=1e-4,
        max_iter=max_iter,
        learning_rate_init=learning_rate_init,
        random_state=random_state,
        verbose=verbose,
    )

    # scikit-learn's fit() takes the feature matrix first, then the targets.
    clf.fit(features, labels)

    return clf

In [47]:
# Train the classifier on the full table.
clf = neural_net(labels=labels, features=features)

Iteration 1, loss = 8.18018795
Iteration 2, loss = 24.21230216
Iteration 3, loss = 30.30948296
Iteration 4, loss = 9.88007831
Iteration 5, loss = 13.68022996
Iteration 6, loss = 33.21603596
Iteration 7, loss = 6.38629593
Iteration 8, loss = 3.35115452
Iteration 9, loss = 2.78784699
Iteration 10, loss = 3.73717405
Iteration 11, loss = 4.75834457
Iteration 12, loss = 3.30780738
Iteration 13, loss = 1.81508373
Iteration 14, loss = 1.35263242
Iteration 15, loss = 2.22172046
Iteration 16, loss = 1.20616628
Iteration 17, loss = 0.81909525
Iteration 18, loss = 0.88694307
Iteration 19, loss = 0.85504061
Iteration 20, loss = 0.77372479




In [48]:
# Predict all samples in one batched call instead of one predict() call per row.
# NOTE: these are the same samples the classifier was fitted on, so this shows how
# well the model reproduces its training data, not how it generalises.
predictions = clf.predict(features)
for i in range(len(predictions)):
    # Slicing keeps each item a one-element array, so it still prints as ['<label>'].
    print(predictions[i:i + 1])

['Subarkose']
['Quartz arenite']
['Subarkose']
['Subarkose']
['Subarkose']
['Arkose']
['Arkose']
['Subarkose']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Subarkose']
['Arkose']
['Subarkose']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Arkose']
['Quartz arenite']
['Quartz arenite']
['Subarkose']
['Subarkose']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Arkose']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
['Subarkose']
['Quartz arenite']
['Quartz arenite']
['Quartz arenite']
