In [1]:
import numpy as np

import pandas as pd

from zse.collections import *
from zse.utilities import *

from ase.io import read, write
from matplotlib import cm
from matplotlib import pyplot as plt
from collections import Counter

from sklearn.cluster import KMeans

from collections import defaultdict

from sklearn.metrics import silhouette_samples, silhouette_score, completeness_score, homogeneity_score, mean_squared_error
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

from sklearn.preprocessing import StandardScaler

import itertools
from itertools import permutations

from sklearn.preprocessing import MinMaxScaler
import sys

from sklearn.neighbors import NearestNeighbors

from kneed import KneeLocator


In [2]:
def get_fw_data(file,code):
    '''Read the T-site and O-site ring entries recorded for one framework.

    The file is scanned for the first line whose first whitespace-separated
    field equals ``code``. Every following line is then examined until a line
    whose first field is ``Framework`` (or the end of the file) is reached:
    lines containing both ``T`` and ``:`` are stored in the T-site dictionary,
    lines containing both ``O`` and ``:`` in the O-site dictionary. In both
    cases the text before the first ``:`` is the key and the text between the
    first and second ``:`` is the value. Blank lines are ignored.

    Arguments:
        file: path of the ring data file to read
        code: framework code whose section should be extracted

    Returns:
        (t_sites, o_sites): two dictionaries mapping site label to ring string

    Raises:
        ValueError: if no line of the file starts with ``code``
    '''
    # the context manager closes the file even when reading fails
    with open(file,'r') as handle:
        data = handle.readlines()

    # locate the line that opens this framework's section
    start = None
    for i,line in enumerate(data):
        fields = line.split()
        # a blank line has no fields and cannot be a section header
        if fields and fields[0] == code:
            start = i
            break
    if start is None:
        raise ValueError('framework code %r not found in %s' % (code, file))

    # go through the framework chunk and collect its site entries
    t_sites = {}
    o_sites = {}
    for line in data[start+1:]:
        line = line.rstrip(' \n')

        if ':' in line:
            fields = line.split(':')
            # a line holding both letters is stored in both dictionaries
            if 'T' in line:
                t_sites[fields[0]] = fields[1]
            if 'O' in line:
                o_sites[fields[0]] = fields[1]

        # the next section header marks the end of this framework's chunk
        fields = line.split()
        if fields and fields[0] == 'Framework':
            break

    return(t_sites,o_sites)

def numofrings(file,code):
    '''Count, for every T-site of a framework, how many rings of each size it has.

    Arguments:
        file: path of the ring data file
        code: framework code to look up

    Returns:
        dictionary mapping each T-site label to a {ring size: count}
        dictionary. The counts of every T-site are also printed.
    '''
    # call function to get data
    tsites, osites = get_fw_data(file,code)

    counts = {}
    for t in tsites:
        # sizes 1-12 are always present so every T-site prints the same keys;
        # larger sizes are added only when they occur
        tsite_dict = {size: 0 for size in range(1, 13)}

        # ring sizes are parsed as '_'-separated integers (the same format
        # nringsvector reads), so two-digit sizes and any ordering are handled
        for token in tsites[t].split('_'):
            if not token.strip():
                continue
            size = int(token)
            tsite_dict[size] = tsite_dict.get(size, 0) + 1

        print(t,tsite_dict)
        counts[t] = tsite_dict

    return counts
        

def nringsvector(code,file):
    '''Build the ring-count matrix of one framework.

    Arguments:
        code: framework code to look up
        file: path of the ring data file

    Returns:
        array of shape (number of T-sites, 21); entry [k][j] is the number of
        rings of size j listed for the k-th T-site. Column 0 is never filled.

    Raises:
        IndexError: if a ring size lies outside 1..20
        ValueError: if a '_'-separated entry is not an integer
    '''
    # get site data
    tsites, osites = get_fw_data(file,code)

    # one row per T-site, one column per ring size
    A = np.zeros([len(tsites),21])

    for k, t in enumerate(tsites):
        # the ring string is a '_'-separated list of ring sizes
        for token in tsites[t].split('_'):
            size = int(token)
            # index the column directly instead of scanning upwards for it,
            # so the order in which sizes are listed does not matter
            if not 1 <= size < A.shape[1]:
                raise IndexError('ring size %d of %s %s is outside 1..%d'
                                 % (size, code, t, A.shape[1] - 1))
            A[k][size] += 1
    return A

def assemblematrix(file):
    '''Stack the ring-count matrices of every framework into one matrix.

    Arguments:
        file: path of the ring data file

    Returns:
        array whose rows are the T-site ring vectors of all frameworks, in the
        order returned by get_all_fws()
    '''
    fws = get_all_fws()
    # every framework, including the first, is taken from the list itself and
    # the blocks are joined once instead of re-copying the matrix per framework
    blocks = [nringsvector(code,file) for code in fws]
    return np.concatenate(blocks, axis=0)

def kmeans_percent(file,ncluster,random_state=None):
    '''Measure how often all T-sites of a framework land in the same cluster.

    The T-site ring vectors of every framework are clustered with KMeans and
    the cluster labels are regrouped per framework.

    Arguments:
        file: path of the ring data file
        ncluster: number of KMeans clusters
        random_state: optional seed forwarded to KMeans; the default (None)
            leaves the clustering unseeded

    Returns:
        (per, dict_fws): per is the fraction of frameworks with more than one
        T-site whose T-sites all share a single cluster label (nan when no
        framework has more than one T-site); dict_fws maps each framework code
        to the list of cluster labels of its T-sites
    '''
    A = assemblematrix(file)
    cluster = KMeans(n_clusters=ncluster, random_state=random_state).fit(A).labels_

    fws = get_all_fws()
    # rows of A are stored framework by framework, so walking the frameworks in
    # the same order and counting T-sites recovers each framework's labels
    dict_fws = {}
    count = 0
    for code in fws:
        tsites, osites = get_fw_data(file,code)
        dict_fws[code] = [cluster[count + offset] for offset in range(len(tsites))]
        count = count + len(tsites)

    # only frameworks with several T-sites can disagree with themselves
    multi_site = [labels for labels in dict_fws.values() if len(labels) > 1]
    # a framework counts only when every one of its labels is identical
    allsame = sum(1 for labels in multi_site if len(set(labels)) == 1)

    per = allsame/len(multi_site) if multi_site else float('nan')
    return per, dict_fws

def weight_avg(file):
    '''Build one weighted-average ring vector per framework.

    For every framework the rows of its T-site ring matrix are combined with
    weights m/sum(tm), where tm is the second value returned by
    get_tsites(code).
    NOTE(review): tm is presumably the T-site multiplicity and is assumed to
    be ordered like the rows produced by nringsvector - confirm.

    Arguments:
        file: path of the ring data file (whichever ring definition is used)

    Returns:
        array of shape (number of frameworks, 21), one row per framework in
        the order returned by get_all_fws()
    '''
    fws = get_all_fws()
    fws_weights = np.zeros([len(fws),21])
    for count, code in enumerate(fws):
        ts, tm, ti = get_tsites(code)
        # matrix of the T-site ring vectors, one row per T-site
        rings = nringsvector(code,file)
        # the normalisation is the same for every T-site of this framework
        total = sum(tm)
        temp = np.zeros(len(rings[0]))
        # zip stops at the shortest of the three sequences
        for ring_count, (_, m, _) in enumerate(zip(ts,tm,ti)):
            temp = (m/total)*rings[ring_count] + temp
        fws_weights[count] = temp
    return fws_weights

def _kmeans_members(X, n_clusters, random_state):
    '''Cluster the rows of X with KMeans and group framework codes by label.'''
    fws = get_all_fws()
    labels = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X).labels_
    members = defaultdict(list)
    for row, label in enumerate(labels):
        members[label].append(fws[row])
    return members


def compare4(A,B,nclusters,rand_state,vbool=False):
    '''Compare the KMeans clusterings of two framework matrices.

    Both matrices are clustered with the same number of clusters and seed.
    The clusters of each result are ranked from largest to smallest, and the
    ranked clusters of A are paired with those of B according to the first
    ``nclusters`` permutations produced by itertools.permutations.

    Arguments:
        A, B: matrices with one row per framework, in get_all_fws() order
        nclusters: number of KMeans clusters
        rand_state: seed passed to KMeans for both clusterings
        vbool: unused; kept for backward compatibility

    Returns:
        list with one value per evaluated permutation: the number of
        frameworks shared by the paired clusters divided by the number of
        frameworks clustered in A
    '''
    # rank the clusters by size, largest first (ties keep label-discovery order)
    a_clusters = sorted(_kmeans_members(A, nclusters, rand_state).values(),
                        key=len, reverse=True)
    b_clusters = sorted(_kmeans_members(B, nclusters, rand_state).values(),
                        key=len, reverse=True)

    # normalise by the real number of frameworks rather than a fixed count
    total = sum(len(members) for members in a_clusters)

    # only the first nclusters permutations are evaluated, so take them lazily
    # instead of building the full factorial-sized list
    pairings = itertools.islice(
        itertools.permutations(range(len(a_clusters)), len(b_clusters)),
        nclusters)

    temp = []
    for perm in pairings:
        # frameworks that sit in both clusters of every (A rank, B rank) pair
        shared = sum(len(set(a_clusters[a]).intersection(b_clusters[b]))
                     for b, a in enumerate(perm))
        temp.append(shared/total)

    return temp

def isNaN(num):
    '''Return the result of comparing ``num`` as unequal to itself.

    NaN is the value that is not equal to itself, so this is True for NaN and
    False for ordinary numbers, strings and None.
    '''
    differs_from_itself = num != num
    return differs_from_itself

def properties(A,values,nclusters,vbool=False):
    '''Summarise a per-framework property within each KMeans cluster.

    Arguments:
        A: the assembled matrix that is clustered
        values: per-framework property values, indexed like get_all_fws()
        nclusters: number of clusters
        vbool: forwarded as the third positional argument of Kmeansresults

    Returns:
        (avg, stdev): lists holding the mean and the standard deviation of the
        integer-converted, non-NaN values of each cluster that has at least
        one such value, in the order the clusters are first encountered
    '''
    # all framework codes, used to look up each framework's position
    fws = get_all_fws()
    # cluster membership and cluster sizes
    members_by_cluster, cluster_sizes = Kmeansresults(A,nclusters,vbool)
    print(cluster_sizes)

    # collect the usable property values of every cluster
    value_dict = defaultdict(list)
    for label in members_by_cluster:
        for code in members_by_cluster[label]:
            value = values[fws.index(code)]
            # NaN marks a framework without a value; leave it out
            if isNaN(value) != True:
                value_dict[label].append(int(value))

    # per-cluster mean and standard deviation
    avg = [sum(vals)/len(vals) for vals in value_dict.values()]
    stdev = [np.std(vals) for vals in value_dict.values()]

    return avg, stdev

def Kmeansresults(A,n,minusBSV = False,*,random_state=9):
    '''Cluster the rows of A with KMeans and group framework codes by cluster.

    Arguments:
        A: matrix with one row per framework, in get_all_fws() order
        n: number of clusters
        minusBSV: when True, the framework at position 39 of get_all_fws() is
            left out before rows are matched to codes
            (NOTE(review): presumably BSV - confirm the position)
        random_state: keyword-only seed passed to KMeans; defaults to 9

    Returns:
        (zlist, llist): zlist maps each cluster label to the list of framework
        codes assigned to it; llist holds the size of clusters 0..n-1
    '''
    fws = get_all_fws()
    if minusBSV == True:
        fws = np.concatenate((fws[0:39],fws[40:]))
    cluster = KMeans(n_clusters=n,random_state=random_state).fit(A).labels_

    # row i of A belongs to framework fws[i]
    zlist = defaultdict(list)
    for i, label in enumerate(cluster):
        zlist[label].append(fws[i])

    # indexing the defaultdict gives an (empty) entry to labels with no members
    llist = [len(zlist[i]) for i in range(n)]

    return zlist, llist


In [None]:
# Weighted-average ring vectors of every framework, one matrix per ring file.
ring_files = (
    '../Data/vertex_rings.txt',
    '../Data/sastre_rings.txt',
    '../Data/crum_rings.txt',
    '../Data/goetzke_rings.txt',
)
A, B, C, D = (weight_avg(path) for path in ring_files)

# Rescale each matrix column-wise to [0, 1]; the scaler is refitted for every
# matrix, so after this cell it holds the fit of the last one (D).
scaler = MinMaxScaler()
scaledA, scaledB, scaledC, scaledD = (
    scaler.fit_transform(matrix) for matrix in (A, B, C, D)
)

In [None]:
# Silhouette score of the KMeans clustering of scaledA for every
# (random seed, number of clusters) pair, stored as x[(seed, k)].
# NOTE(review): reconstructed from an unfinished draft - the seed loop is
# assumed to wrap the cluster-count loop, and the score is taken against
# scaledA (the data that was clustered) rather than C; confirm both.
# This runs 498 * 58 KMeans fits, so expect it to take a while.
x = {}
for n in range(2,500):
    for i in range(2,60):
        clustering = KMeans(n_clusters=i,random_state=n).fit(scaledA)
        cluster = clustering.labels_
        x[(n,i)] = silhouette_score(scaledA,cluster)