In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns


import pickle


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

import warnings
warnings.filterwarnings('ignore')


In [36]:
df = pd.read_excel("synthetic_api_secrets_dataset.xlsx")

In [37]:
df.head()

Unnamed: 0,value,provider,category,risk_level
0,sk-openai-m2GBnkce87NQoMHkfUk7lutCLdgRxjYh,OpenAI,AI_API_KEY,HIGH
1,eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.uwhXVrJYa...,Generic,JWT,HIGH
2,ghp_utWcK1DTWyd5eQhNZOx8kvgudGHDus,GitHub,SERVICE_TOKEN,HIGH
3,ajyE4Hfm-jSFa-z4ZL-VUFP-6CU4tCSrCN4D,Generic,UUID_FALSE_POSITIVE,NONE
4,AKIACQM2APML03FLHBI0,AWS,CLOUD_ACCESS_KEY,CRITICAL


In [38]:
df_cpy = df.copy()

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   value       1000 non-null   object
 1   provider    1000 non-null   object
 2   category    1000 non-null   object
 3   risk_level  1000 non-null   object
dtypes: object(4)
memory usage: 31.4+ KB


In [40]:
df['provider'].value_counts()

provider
Generic       207
AWS           194
GitHub        117
Slack         116
Groq          101
PostgreSQL     98
MongoDB        86
OpenAI         81
Name: count, dtype: int64

In [41]:
df['category'].value_counts()

category
SERVICE_TOKEN          233
DATABASE_URI           184
AI_API_KEY             182
UUID_FALSE_POSITIVE    106
CLOUD_ACCESS_KEY       104
JWT                    101
CLOUD_SECRET_KEY        90
Name: count, dtype: int64

In [42]:
encoder = OneHotEncoder(handle_unknown='ignore')

In [43]:
encoded = encoder.fit_transform(df[['provider', 'category']]).toarray()

In [44]:
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['provider', 'category']))

In [45]:
df = df.drop(columns=['provider', 'category']).join(encoded_df)

In [46]:
df.head()

Unnamed: 0,value,risk_level,provider_AWS,provider_Generic,provider_GitHub,provider_Groq,provider_MongoDB,provider_OpenAI,provider_PostgreSQL,provider_Slack,category_AI_API_KEY,category_CLOUD_ACCESS_KEY,category_CLOUD_SECRET_KEY,category_DATABASE_URI,category_JWT,category_SERVICE_TOKEN,category_UUID_FALSE_POSITIVE
0,sk-openai-m2GBnkce87NQoMHkfUk7lutCLdgRxjYh,HIGH,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.uwhXVrJYa...,HIGH,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,ghp_utWcK1DTWyd5eQhNZOx8kvgudGHDus,HIGH,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,ajyE4Hfm-jSFa-z4ZL-VUFP-6CU4tCSrCN4D,NONE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,AKIACQM2APML03FLHBI0,CRITICAL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [47]:
df['risk_level'].unique()

array(['HIGH', 'NONE', 'CRITICAL'], dtype=object)

In [48]:
df['risk_level'] = df['risk_level'].str.lower()

In [49]:
df['risk_level'] = df['risk_level'].map({'high' : 1, 'none' : 0, 'critical' : 2})

In [50]:
vectorizer = TfidfVectorizer( analyzer='char', ngram_range=(3,6))

In [51]:
df.columns

Index(['value', 'risk_level', 'provider_AWS', 'provider_Generic',
       'provider_GitHub', 'provider_Groq', 'provider_MongoDB',
       'provider_OpenAI', 'provider_PostgreSQL', 'provider_Slack',
       'category_AI_API_KEY', 'category_CLOUD_ACCESS_KEY',
       'category_CLOUD_SECRET_KEY', 'category_DATABASE_URI', 'category_JWT',
       'category_SERVICE_TOKEN', 'category_UUID_FALSE_POSITIVE'],
      dtype='object')

In [52]:
X = df['value']
Y = df['risk_level']

In [53]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [54]:
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [55]:
model1 = LogisticRegression()

In [56]:
model1.fit(X_train_vec, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [57]:
model1.score(X_train_vec, Y_train)

0.895

In [58]:
model1.score(X_test_vec, Y_test)

0.81

In [59]:
model3 = LogisticRegression()
params = {'penalty' : ['l1', 'l2', 'elasticnet', None],
         'dual' : [True, False],
         'fit_intercept' : [True, False],
         'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
         'multi_class' : ['auto', 'ovr', 'multinomial']
         }

In [60]:
grid = GridSearchCV(model3, params, n_jobs=-1)

In [61]:
grid.fit(X_train_vec, Y_train)

0,1,2
,estimator,LogisticRegression()
,param_grid,"{'dual': [True, False], 'fit_intercept': [True, False], 'multi_class': ['auto', 'ovr', ...], 'penalty': ['l1', 'l2', ...], ...}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,False
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [62]:
grid.best_estimator_

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,False
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [63]:
grid.best_score_

np.float64(0.90625)

In [64]:
grid.best_params_

{'dual': False,
 'fit_intercept': False,
 'multi_class': 'auto',
 'penalty': 'l1',
 'solver': 'liblinear'}

In [65]:
text = 'AKIACQM2APML03FLHBI0'
# Fit the vectorizer if the kernel was restarted and it is not fitted yet
if not hasattr(vectorizer, "idf_"):
    vectorizer.fit(X_train)
textv = vectorizer.transform([text])
result = grid.predict(textv)
if result[0] == 1:
    print("High Risk")
elif result[0] == 2:
    print("Critical")
elif result[0] == 0:
    print("No Risk")

Critical


In [66]:
with open("model3.pkl", "wb") as f:
    pickle.dump(grid, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
    

In [69]:
text = 'AKIACQM2APML03FLHBI0'
textv = vectorizer.transform([text]).toarray()
result = grid.predict(textv)
if result[0] == 1:
    print("High Risk")
elif result[0] == 2:
    print("Critical")
elif result[0] == 0:
    print("No Risk")


Critical
