In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,roc_auc_score
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.manifold import Isomap


from astropy.io import ascii
from utils import col_names, normalise_sdss_class, ellipticity, filling_factor
import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
# 0-based positions of the catalogue columns kept for modelling:
# 6-9, 12-18 and 28-39 (all inclusive).
relevant_indices = list(range(6, 10)) + list(range(12, 19)) + list(range(28, 40))


In [24]:
datasets = "../datasets/SuperCOSMOS/"

# Load the header-less UKI823 pair catalogue with astropy's fast reader.
# The reader is chosen with `format=` because the `Reader=` keyword of
# `ascii.read` is deprecated.
uki823_df = ascii.read(
    datasets + "UKI823/sssedrpair.dat",
    guess=False,
    format="fast_no_header",
).to_pandas()

# Names of the columns selected by relevant_indices
colnames_relevant = [col_names[i] for i in relevant_indices]

# Keep only the relevant columns (selected by position)
data = uki823_df.iloc[:, relevant_indices]

# Create data frame whose columns carry the catalogue column names
data = pd.DataFrame(data.values, columns=colnames_relevant)

# Add in ellipticity, filling factor and normalise CLASS_SDSS.
# The helper inputs are taken from the raw table by column position.
data['Ellipticity'] = ellipticity(uki823_df.iloc[:, 15], uki823_df.iloc[:, 16])
data['Filling Factor'] = filling_factor(data['AREA'], uki823_df.iloc[:, 12], uki823_df.iloc[:, 13])
data = normalise_sdss_class(data)

# One-hot encode the CLASS column

In [25]:
# One-hot encode CLASS. categories='auto' makes the encoder derive the
# categories from the unique values present, which avoids the FutureWarning
# about integer-like input handling and keeps the result stable across
# scikit-learn versions.
enc = OneHotEncoder(categories='auto')
# The encoder expects a 2-D array, hence the reshape to a single column;
# the sparse result is converted to a dense array.
X = enc.fit_transform(data['CLASS'].values.reshape(-1, 1)).toarray()


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [26]:
# Wrap the encoded array in a frame with columns Class_0 .. Class_{k-1}
# (numbered by column position, not by the original CLASS value).
# Reusing data's index keeps the rows aligned when the two frames are
# concatenated column-wise later, even if data has a non-default index.
dfOneHot = pd.DataFrame(
    X,
    columns=["Class_{}".format(i) for i in range(X.shape[1])],
    index=data.index,
)
data['CLASS']

0        2.0
1        1.0
2        2.0
3        2.0
4        2.0
5        2.0
6        1.0
7        2.0
8        1.0
9        1.0
10       2.0
11       1.0
12       2.0
13       1.0
14       2.0
15       2.0
16       2.0
17       2.0
18       2.0
19       2.0
20       1.0
21       1.0
22       1.0
23       2.0
24       1.0
25       2.0
26       2.0
27       2.0
28       1.0
29       2.0
        ... 
15615    1.0
15616    1.0
15617    1.0
15618    1.0
15619    1.0
15620    1.0
15621    2.0
15622    1.0
15623    1.0
15624    1.0
15625    2.0
15626    1.0
15627    2.0
15628    1.0
15629    2.0
15630    1.0
15631    1.0
15632    1.0
15633    1.0
15634    1.0
15635    2.0
15636    1.0
15637    1.0
15638    2.0
15639    1.0
15640    1.0
15641    2.0
15642    1.0
15643    2.0
15644    1.0
Name: CLASS, Length: 15645, dtype: float64

In [27]:
# Display the one-hot encoded frame built from the CLASS column
dfOneHot

Unnamed: 0,Class_0,Class_1,Class_2,Class_3
0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0
5,0.0,1.0,0.0,0.0
6,1.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0
8,1.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0


In [28]:
# Append the one-hot columns, then discard the original CLASS column
# that they replace.
data = pd.concat([data, dfOneHot], axis=1).drop("CLASS", axis=1)


In [29]:
# Separate the target (CLASS_SDSS) from the predictor columns (everything else)
data_y = data['CLASS_SDSS']
data_x = data.drop('CLASS_SDSS', axis=1)

In [30]:
# Sanity check: True if any Filling Factor value is +/- infinity
bool(np.isinf(data_x['Filling Factor']).any())

False

In [31]:
#Split dataset into train, val, test
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import log_loss

# Single seed shared by both splits below and by the models fitted later
random_state = 1

# Hold out 10% for testing, then take 2/9 of the remaining 90% for
# validation, i.e. roughly 70% train / 20% validation / 10% test overall.
X_train, X_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.1, random_state=random_state)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=2./9, random_state=random_state)

# Fit the scaler on the training split only, then apply it to every split
sc = StandardScaler()
sc.fit(X_train)
X_train_sc, X_val_sc, X_test_sc = (
    sc.transform(split) for split in (X_train, X_val, X_test)
)

In [32]:
# X_train_sc=bwe_data_train
# X_val_sc=bwe_data_val

In [33]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import MLPClassifier
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import f1_score,roc_auc_score

# Display names, in the same order as the estimators in `classifiers`
names = ["Logistic Regression", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net (Multi-layer perceptron)"]

# Every estimator gets the shared seed so the comparison is reproducible
# from run to run.
classifiers = [
    LogisticRegression(random_state=random_state),
    SVC(kernel="linear", probability=True, random_state=random_state),
    SVC(kernel='rbf', probability=True, random_state=random_state),
    DecisionTreeClassifier(max_depth=10, random_state=random_state),
    RandomForestClassifier(max_depth=10, n_estimators=50, random_state=random_state),
    MLPClassifier(max_iter=1000, random_state=random_state)]

ca_score = {}   # Classification accuracy per model name
F1_scores = {}  # Macro-averaged F1 score per model name

y_val_true = np.asarray(y_val)
for name, clf in zip(names, classifiers):
    clf.fit(X_train_sc, y_train)
    # Predict once and derive both metrics from the same predictions.
    # The mean of the element-wise matches is the accuracy that
    # clf.score would report.
    y_val_pred = clf.predict(X_val_sc)
    ca_score[name] = np.mean(y_val_pred == y_val_true)
    F1_scores[name] = f1_score(y_val, y_val_pred, average='macro')

print('Classification performance on validation set:')
for name in names:
    print("{}, accuracy: {:.3f}, f1-score: {:.3f}".format(name, ca_score[name], F1_scores[name]))



Classification performance on validation set:
Logistic Regression, accuracy: 0.866, f1-score: 0.858
Linear SVM, accuracy: 0.871, f1-score: 0.864
RBF SVM, accuracy: 0.862, f1-score: 0.855
Decision Tree, accuracy: 0.865, f1-score: 0.857
Random Forest, accuracy: 0.870, f1-score: 0.863
Neural Net (Multi-layer perceptron), accuracy: 0.895, f1-score: 0.888


In [34]:
# classifiers=[RandomForestClassifier(max_depth=10, n_estimators=50,random_state=random_state),
# MLPClassifier(max_iter=1000, random_state=random_state)]

In [35]:
# clf=classifiers[0]
# clf.fit(X_train_sc, y_train)
# misclassified=np.where((y_val-clf.predict(X_val_sc))!=0)
# cor_classified=np.where((y_val-clf.predict(X_val_sc))==0 & (y_val-clf.predict(X_val_sc))!= NaN)

In [36]:
# misclassified[0][2]

In [37]:
# misclassified_random_for_df=X_val.iloc[misclassified[0],:]

# misclassified_random_for_df['CLASS_SDSS']=y_val.iloc[misclassified]

# correctly_classified_df=X_val.iloc[cor_classified[0], :]
# correctly_classified_df['CLASS_SDSS'] = y_val.iloc[cor_classified]


# Cross-validated backward elimination (RFECV) using a linear SVM




In [38]:

# import matplotlib.pyplot as plt
# from sklearn.svm import SVC
# from sklearn.model_selection import StratifiedKFold
# from sklearn.feature_selection import RFECV
# from sklearn.datasets import make_classification

# # Build a classification task using 3 informative features
# X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
#                            n_redundant=2, n_repeated=0, n_classes=8,
#                            n_clusters_per_class=1, random_state=0)

# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# Features are eliminated one at a time (step=1) and each subset is scored
# by accuracy over 2 stratified folds. n_jobs=-1 fits the folds in parallel,
# since this search is slow on the full training set.
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
              scoring='accuracy', n_jobs=-1)
rfecv.fit(X_train_sc, y_train)

print("Optimal number of features : %d" % rfecv.n_features_)

# Newer scikit-learn releases expose the per-subset scores through
# cv_results_; older ones only provide grid_scores_.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

KeyboardInterrupt: 

In [None]:
# Per-feature ranking from the fitted RFECV; the following cell keeps the
# features whose rank is 1
rfecv.ranking_

In [None]:
# Column positions of the features that RFECV ranked 1 (the retained ones)
ind_list = [
    position
    for position in range(len(X_train.columns))
    if rfecv.ranking_[position] == 1
]
        

In [None]:
# Restrict the scaled train / validation matrices to the selected columns
bwe_data_train, bwe_data_val = (
    scaled[:, ind_list] for scaled in (X_train_sc, X_val_sc)
)

In [40]:
# Isomap: 2-D embedding of the scaled training data


In [41]:
def scatter_2d_label(X_2d, y, s=2, alpha=0.5, lw=2, ax=None):
    """Visualise a 2D embedding with corresponding labels.

    One scatter series is drawn per distinct label, each in its own colour
    from the seaborn palette and labelled with the label value so that a
    legend can be added by the caller.

    X_2d : array-like, shape (n_samples,2)
        Low-dimensional feature representation.

    y : array-like, shape (n_samples,)
        Labels corresponding to the entries in X_2d.

    s : float
        Marker size for scatter plot.

    alpha : float
        Transparency for scatter plot.

    lw : float
        Linewidth for scatter plot.

    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current axes.
    """
    # Accept lists / DataFrames / Series as well as ndarrays: the boolean
    # mask indexing below needs plain positional arrays.
    X_2d = np.asarray(X_2d)
    y = np.asarray(y)
    if ax is None:
        ax = plt.gca()

    targets = np.unique(y)
    colors = sns.color_palette(n_colors=targets.size)
    for color, target in zip(colors, targets):
        mask = y == target
        ax.scatter(X_2d[mask, 0], X_2d[mask, 1], color=color, label=target,
                   s=s, alpha=alpha, lw=lw)

In [None]:
# from sklearn.manifold import Isomap
sns.set(font_scale=1.5)  # Set default font size

# One panel per neighbourhood size; the grid is sized from this list so
# that no empty axes are left in the figure.
neighbour_counts = [50, 100]
n_cols = 2
n_rows = (len(neighbour_counts) + n_cols - 1) // n_cols  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 6 * n_rows), squeeze=False)
axes = axes.ravel()
for ax, n_neighbors in zip(axes, neighbour_counts):
    ismp = Isomap(n_components=2, n_neighbors=n_neighbors)
    X_ismp_2d = ismp.fit_transform(X_train_sc)
    plt.sca(ax)  # scatter_2d_label draws on the current axes
    scatter_2d_label(X_ismp_2d, y_train)
    ax.set_title('{} neighbours'.format(n_neighbors))
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')

# Hide any grid cells that did not receive a panel
for unused_ax in axes[len(neighbour_counts):]:
    unused_ax.set_visible(False)

# Legend just outside the right edge of the last populated panel
ax.legend(loc='center left', bbox_to_anchor=(1.01, 0.5), scatterpoints=3)
fig.tight_layout()
plt.show()