In [188]:
import pandas as pd
from datetime import datetime
import pandasql
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

In [189]:
df = pd.read_csv("logins.txt", sep="\t", header=None)
df[["First Letter", "Name"]] = df[2].str.split(".", expand=True)

def normalize(seq):
    min_val = seq.min()
    max_val = seq.max()
    normalized_seq = (seq - min_val) / (max_val - min_val)
    return normalized_seq

df

Unnamed: 0,0,1,2,3,First Letter,Name
0,2021-01-01,00:00:00,s.matta,OUT,s,matta
1,2021-01-01,00:05:00,s.paige,OUT,s,paige
2,2021-01-01,01:52:00,t.goldschmidt,OUT,t,goldschmidt
3,2021-01-01,01:54:00,t.hardin,OUT,t,hardin
4,2021-01-01,02:40:00,c.elson,OUT,c,elson
...,...,...,...,...,...,...
3771147,2022-03-18,00:53:00,m.ponds,OUT,m,ponds
3771148,2022-03-18,16:54:00,m.ponds,IN,m,ponds
3771149,2022-03-19,00:57:00,m.ponds,OUT,m,ponds
3771150,2022-03-22,16:54:00,m.ponds,IN,m,ponds


In [190]:
def time2num(time: str):
    hours, minutes, seconds = map(int, time.split(":"))
    return hours * 3600 + minutes * 60 + seconds


try:
    df[1] = df[1].apply(time2num)
    # df[1] = normalize(df[1])
except:
    print("Already converted")


# df[1] = df[1].apply(time2num)



# for i in range(5):
#     day = df.loc[(df[0] == df[0][0]) & (df[3] == "IN")]
#     histogram,_ = np.histogram(df[1],bins=256)
#     plt.hist(histogram,bins=256)
#     plt.show()
# histogram

In [191]:
df.loc[df[0] == df[0][0]]

Unnamed: 0,0,1,2,3,First Letter,Name
0,2021-01-01,0.000000,s.matta,OUT,s,matta
1,2021-01-01,0.003475,s.paige,OUT,s,paige
2,2021-01-01,0.077832,t.goldschmidt,OUT,t,goldschmidt
3,2021-01-01,0.079222,t.hardin,OUT,t,hardin
4,2021-01-01,0.111188,c.elson,OUT,c,elson
...,...,...,...,...,...,...
93,2021-01-01,0.747047,t.hardin,IN,t,hardin
94,2021-01-01,0.748436,t.goldschmidt,IN,t,goldschmidt
95,2021-01-01,0.775539,c.elson,IN,c,elson
96,2021-01-01,0.789437,b.commodore,IN,b,commodore


In [192]:
query3 = """
SELECT [2], AVG([1]) as AvgColumn
FROM df
WHERE [3] = 'IN'
GROUP BY [2]
ORDER BY AVG([1]) DESC
"""

result = pandasql.sqldf(query3, locals())



In [193]:
result

Unnamed: 0,2,AvgColumn
0,r.lombardi,0.796525
1,m.then,0.796372
2,k.littlefield,0.796282
3,s.figueroa,0.796267
4,r.newton,0.796214
...,...,...
6274,j.shyne,0.246259
6275,m.stoll,0.246174
6276,j.99,0.246160
6277,c.rank,0.246146


In [194]:
clf = KMeans(n_clusters=5)


X = result['AvgColumn'].values.reshape(-1, 1)

clf.fit(X)
centers = clf.cluster_centers_
result['cluster'] = clf.labels_

westcluster = int(result.loc[result['2'] == 's.kinkel']['cluster'])
westerners = result.loc[result['cluster'] == westcluster]
westerners = westerners['2'].unique()

filtered_df = df[df[2].isin(westerners)]
filtered_df

  super()._check_params_vs_input(X, default_n_init=10)
  westcluster = int(result.loc[result['2'] == 's.kinkel']['cluster'])


Unnamed: 0,0,1,2,3,First Letter,Name
0,2021-01-01,0.000000,s.matta,OUT,s,matta
1,2021-01-01,0.003475,s.paige,OUT,s,paige
75,2021-01-01,0.667130,s.matta,IN,s,matta
78,2021-01-01,0.669910,s.paige,IN,s,paige
98,2021-01-02,0.000000,e.strange,OUT,e,strange
...,...,...,...,...,...,...
3771129,2022-02-24,0.042391,j.bruner,OUT,j,bruner
3771134,2022-03-06,0.711605,j.bruner,IN,j,bruner
3771135,2022-03-07,0.045170,j.bruner,OUT,j,bruner
3771136,2022-03-10,0.711605,j.bruner,IN,j,bruner


In [195]:
filtered_df.loc[filtered_df[0] == "2022-03-10"]

Unnamed: 0,0,1,2,3,First Letter,Name
3771136,2022-03-10,0.711605,j.bruner,IN,j,bruner


In [196]:
queryf = """
SELECT [0],AVG([1]),COUNT(1)
FROM filtered_df
WHERE [3] = 'IN'
GROUP BY [0]
ORDER BY AVG([1]) DESC
LIMIT 30
"""

final = pandasql.sqldf(queryf, locals())
final

Unnamed: 0,0,AVG([1]),COUNT(1)
0,2021-11-25,0.737665,2
1,2021-10-29,0.724288,4
2,2021-01-18,0.716073,7
3,2021-11-26,0.714385,3
4,2022-03-10,0.711605,1
5,2022-03-06,0.711605,1
6,2022-02-23,0.711605,1
7,2022-02-21,0.711605,1
8,2022-02-14,0.711605,1
9,2022-01-30,0.711605,1


In [197]:
result["1"].describe

iqr = result["1"].quantile(0.75) - result["1"].quantile(0.25)

low,high = result["1"].min()-1.5*iqr,result["1"].max()+1.5*iqr

# low,high = result["1"].max(),result["1"].min()
result["1"].max()

KeyError: '1'

In [None]:
#Getting everybody in IQR range of s.kinkel. This is our proto-cluster
query2 = f"""
SELECT DISTINCT [2], [1], [0]
FROM df
WHERE [3] = 'IN' AND [1] BETWEEN {low} AND {high}
"""
result2 = pandasql.sqldf(query2, locals())
result2

Unnamed: 0,2,1,0
0,t.hardin,64500,2021-01-01
1,t.goldschmidt,64620,2021-01-01
2,b.21,62400,2021-01-02
3,k.borden,62460,2021-01-02
4,r.carter,62520,2021-01-02
...,...,...,...
84464,k.12,66120,2021-12-31
84465,s.kirkland,66180,2021-12-31
84466,d.butler,66180,2021-12-31
84467,j.63,66180,2021-12-31


In [None]:
query3 = """
SELECT [0], AVG([1]) as AvgColumn
FROM result2
GROUP BY [0]
ORDER BY AVG([1]) DESC
"""
result3 = pandasql.sqldf(query3, locals())
result3

Unnamed: 0,0,AvgColumn
0,2021-02-15,64770.000000
1,2021-01-01,64560.000000
2,2021-11-11,64260.000000
3,2021-06-11,64122.310757
4,2021-12-25,64105.714286
...,...,...
360,2021-01-18,63760.000000
361,2021-11-26,63720.000000
362,2021-05-31,63720.000000
363,2021-10-29,63060.000000


In [None]:
sorted_df = result3.sort_values(by='AvgColumn', ascending=False)
sorted_df['AvgColumn'].mean()


63945.74584262983

In [None]:
day0 = df.loc[df[0] == df[0][0]]

day0in = df.loc[df[3] == "IN"]
day0in[1] = normalize(day0in[1])


day1 = df.loc[df[0] == df[0][1]]

day1in = df.loc[df[3] == "IN"]
day1in[1] = normalize(day1in[1])



day1in

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  day0in[1] = normalize(day0in[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  day1in[1] = normalize(day1in[1])


Unnamed: 0,0,1,2,3,First Letter,Name
7,2021-01-01,0.291946,l.wood,IN,l,wood
8,2021-01-01,0.295302,d.bonita,IN,d,bonita
9,2021-01-01,0.298658,s.johnson,IN,s,johnson
10,2021-01-01,0.312081,l.luck,IN,l,luck
11,2021-01-01,0.313758,r.cogar,IN,r,cogar
...,...,...,...,...,...,...
3771142,2022-03-15,0.843121,m.ponds,IN,m,ponds
3771144,2022-03-16,0.843121,m.ponds,IN,m,ponds
3771146,2022-03-17,0.843121,m.ponds,IN,m,ponds
3771148,2022-03-18,0.843121,m.ponds,IN,m,ponds
