# Importing Data from Kaggle

https://www.kaggle.com/general/74235 
https://www.youtube.com/watch?v=57N1g8k2Hwc&ab_channel=ShriramVasudevan


Data importing

In [None]:
# Download the competition archive (ieee-fraud-detection.zip) via the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured beforehand — confirm.
! kaggle competitions download -c "ieee-fraud-detection"

Downloading ieee-fraud-detection.zip to /content
 89% 105M/118M [00:04<00:00, 30.1MB/s] 
100% 118M/118M [00:04<00:00, 25.3MB/s]


In [None]:
!unzip ieee-fraud-detection.zip

Archive:  ieee-fraud-detection.zip
  inflating: sample_submission.csv   
  inflating: test_identity.csv       
  inflating: test_transaction.csv    
  inflating: train_identity.csv      
  inflating: train_transaction.csv   


# Importing Libs


In [None]:
import numpy as np
import pandas as pd

import math
import os
import random
import re
import sys

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
import matplotlib as mpl

# Code


# Identity Table

Variables in this table are identity information like 
* network connection information (IP, ISP, Proxy, etc)  
* digital signature (UA/browser/os/version, etc) associated with transactions.

They're collected by Vesta’s fraud protection system and digital security partners. (The field names are masked and pairwise dictionary will not be provided for privacy protection and contract agreement)

### Categorical Features:
* DeviceType
* DeviceInfo
* id_12 - id_38

In [None]:
# Original identity data: only the first 4000 rows are read.
data = pd.read_csv('/content/train_identity.csv', nrows=4000)

# Keep the column index; a later cell prints it.
col_labels = data.columns

# Only the last expression of a cell is displayed, so the previously discarded
# `data.count()` / `len(data.columns)` statements were removed.
data.head(3)

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows


In [None]:
# Show the identity column names, then the (rows, columns) shape.
# The former `data.ndim` statement was dropped: its value was never displayed.
print(col_labels)
data.shape

Index(['TransactionID', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06',
       'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14',
       'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22',
       'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30',
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object')


(4000, 41)

In [None]:
# Compact summary of the identity frame; "deep" memory usage also measures
# the contents of object columns.
data.info(memory_usage="deep", verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Columns: 41 entries, TransactionID to DeviceInfo
dtypes: float64(23), int64(1), object(17)
memory usage: 4.4 MB


In [None]:
# Peek at the submission template to see the expected output columns.
sample = pd.read_csv(filepath_or_buffer="/content/sample_submission.csv")
sample.head(n=1)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5


# Transaction Table
* TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
* TransactionAmt: transaction payment amount in USD
* ProductCD: product code, the product for each transaction
* card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
* addr: address
* dist: distance
* P_emaildomain and R_emaildomain: purchaser and recipient email domain
* C1-C14: counting, such as how many addresses are found to be  associated with the payment card, etc. The actual meaning is masked.
* D1-D15: timedelta, such as days between previous transaction, etc.
* M1-M9: match, such as names on card and address, etc.
* Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.

### Categorical Features:
* ProductCD
* card1 - card6
* addr1, addr2
* P_emaildomain
* R_emaildomain
* M1 - M9


In [None]:
# Transaction data: only the first 4000 rows are read.
trans = pd.read_csv("/content/train_transaction.csv", nrows=4000)

# The former `trans.head(11)` statement was dropped: it sat mid-cell, so its
# result was never displayed.
trans_labels = trans.columns
print(trans_labels)

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338',
       'V339'],
      dtype='object', length=394)


Replacing 'NaN' values with most frequent values for both dataframes

https://www.youtube.com/watch?v=8p6XaQSIFpY&ab_channel=sentdex 

In [None]:
# Separate working handles. merge() and fillna() below return new frames, so
# the raw `data` and `trans` frames are not modified by this cell.
dataset_1 = data
dataset_2 = trans

# Attach the label to the identity table.
# Both tables are truncated samples, so an identity row may have no matching
# transaction. A left join would give such rows a NaN label, and the mode
# imputation below would then silently turn it into the majority class.
# An inner join on the ID keeps only rows whose label is actually known.
dataset_1 = dataset_1.merge(
    dataset_2[['TransactionID', 'isFraud']],
    how='inner',
    on='TransactionID',
    validate='one_to_one',  # fail loudly if a TransactionID is duplicated
)
assert not dataset_1.empty, "no identity rows match the loaded transactions"

# Impute missing feature values with each column's most frequent value.
# `isFraud` has no missing values at this point, so labels are never imputed.
dataset_1 = dataset_1.fillna(dataset_1.mode().iloc[0])
dataset_2 = dataset_2.fillna(dataset_2.mode().iloc[0])


In [None]:
def coverting_data(df):
  """Label-encode every non-numeric column of ``df`` as integer codes.

  Codes are assigned in order of first appearance within each column
  (first distinct value -> 0, next -> 1, ...), so the encoding is
  reproducible across kernel restarts. Numeric columns of any width are
  left untouched; boolean columns are encoded like other non-numeric ones.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to encode. It is modified in place.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object, with non-numeric columns replaced by codes.
  """
  for column in df.columns:
    series = df[column]
    # Skip anything that is already numeric (not just int64/float64).
    is_numeric = pd.api.types.is_numeric_dtype(series)
    if is_numeric and not pd.api.types.is_bool_dtype(series):
      continue

    values = series.tolist()
    text_to_digit = {}
    for value in values:
      # len() is the next unused code; first-appearance order keeps it deterministic.
      if value not in text_to_digit:
        text_to_digit[value] = len(text_to_digit)

    df[column] = [text_to_digit[value] for value in values]

  return df


# Encode the non-numeric columns of both frames as integer codes
# (identity table first, then transactions).
dataset_1, dataset_2 = coverting_data(dataset_1), coverting_data(dataset_2)


In [None]:
# Distinct integer codes now stored in the encoded DeviceInfo column.
dataset_1.loc[:, 'DeviceInfo'].unique()

array([221, 272, 263, 140,  94, 178, 131, 126,  70, 182, 154,  20, 152,
       249, 134, 151, 206, 164,  38,  52,  13, 175, 188, 145, 202,   2,
       264, 150,  28,  37,  68, 258,   1, 199, 232,  86,  50, 160,  24,
       129, 125, 243, 204, 161, 200, 159, 225, 142,  96, 177,   4, 170,
        31, 197, 100, 276, 146, 233, 191,  27,  39, 210, 227,   9,  21,
        33,  35,  80,  49, 187,  46,  89,  42, 259, 214, 162, 281, 156,
       180,  56, 265,  15,  62, 255, 121,  92,  41, 271, 132, 123, 228,
       234, 171, 266, 277, 270, 174, 253,  53,  36,  87,  32, 153, 239,
        72, 172, 166, 107, 117, 148,  65,  98, 201, 212, 223, 157, 235,
       242, 183, 109,  61,  34,  11, 108,  66, 119,   6,  51,  88,  54,
       147, 236,  12, 280,  71, 155,  23, 229, 262, 192, 110, 222, 219,
       273,  60,  22,  63, 158, 105,  73, 103, 127, 230,  74,  75, 267,
       257,  95, 165, 113, 115, 144, 279, 275, 135,  16, 173, 143, 122,
        57, 101, 261,   3, 167,  69, 141,  78,  43, 224, 208,  7

Merging the two dataframes into one on **"TransactionID"**

In [None]:
# Join the identity features onto the transactions by TransactionID only.
# The label is taken from the transaction table; joining on `isFraud` as well
# would make the match depend on both frames agreeing on its dtype and value.
dataset_3 = pd.merge(
    dataset_2,
    dataset_1.drop(columns='isFraud', errors='ignore'),
    how='left',
    on='TransactionID',
)
dataset_3.shape

(4000, 434)

In [None]:

# The left join leaves NaN identity features for transactions without an
# identity row; fill every gap with that column's most frequent value.
most_frequent = dataset_3.mode().iloc[0]
dataset_3 = dataset_3.fillna(value=most_frequent)
dataset_3.head(n=3)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,2,13926,194.0,150.0,1,142.0,...,7.0,24.0,46.0,2.0,0.0,0.0,0.0,0.0,0.0,263.0
1,2987001,0,86401,29.0,2,2755,404.0,150.0,0,102.0,...,7.0,24.0,46.0,2.0,0.0,0.0,0.0,0.0,0.0,263.0
2,2987002,0,86469,59.0,2,4663,490.0,150.0,3,166.0,...,7.0,24.0,46.0,2.0,0.0,0.0,0.0,0.0,0.0,263.0


## Note

The model is tested by supplying a `TransactionID`; for each ID it must output a fraud prediction in the `isFraud` column, as in `sample_submission.csv`.

In [None]:
#trans_dataset_labels=trans['isFraud']
#combined_dataset_labels=df_merged_table['isFraud']
#data_with_label= df_merged_table

In [None]:
# First twelve rows of the encoded, imputed transaction table.
dataset_2.iloc[:12]

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,2,13926,194.0,150.0,1,142.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,2,2755,404.0,150.0,0,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,2,4663,490.0,150.0,3,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,2,18132,567.0,150.0,0,117.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,1,4497,514.0,150.0,0,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2987005,0,86510,49.0,2,5937,555.0,150.0,3,226.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2987006,0,86522,159.0,2,12308,360.0,150.0,3,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2987007,0,86529,422.5,2,12695,490.0,150.0,3,226.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2987008,0,86535,15.0,1,2803,100.0,150.0,3,226.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2987009,0,86536,117.0,2,17399,111.0,150.0,0,224.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# First two rows of the merged transaction + identity table.
dataset_3.iloc[:2]

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,2,13926,194.0,150.0,1,142.0,...,7.0,24.0,46.0,2.0,0.0,0.0,0.0,0.0,0.0,263.0
1,2987001,0,86401,29.0,2,2755,404.0,150.0,0,102.0,...,7.0,24.0,46.0,2.0,0.0,0.0,0.0,0.0,0.0,263.0


# Class distribution

In [None]:
# Label counts for the identity, transaction and merged tables, in that order.
for frame in (dataset_1, dataset_2, dataset_3):
  print(frame['isFraud'].value_counts())

0.0    3967
1.0      33
Name: isFraud, dtype: int64
0    3917
1      83
Name: isFraud, dtype: int64
0    3917
1      83
Name: isFraud, dtype: int64


# Trying out SVM

In [None]:
# from sklearn.model_selection import train_test_split
def data_splitting(dataset,testset_size=0.2,randomState=1):
  """Split a labelled frame into train/test feature and label arrays.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Frame containing an ``isFraud`` label column; every other column is
      used as a feature.
  testset_size : float or int, default 0.2
      Passed to ``train_test_split`` as ``test_size``.
  randomState : int, default 1
      Passed to ``train_test_split`` as ``random_state``.

  Returns
  -------
  tuple
      ``(X_train, X_test, y_train, y_test)`` as returned by
      ``train_test_split``.
  """
  # Separate the label from the features.
  # NOTE(review): TransactionID stays in X as a feature — confirm this is intended.
  X = dataset.drop('isFraud', axis=1).values
  y = dataset['isFraud'].values

  # Honour the caller's arguments; these were previously hard-coded to
  # test_size=0.2 and random_state=0, so the parameters had no effect.
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=testset_size, random_state=randomState
  )

  return X_train, X_test, y_train, y_test

def confusion_matrix_function(Ytest,yPred):
  """Print the accuracy derived from the confusion matrix of the predictions.

  Accuracy is the sum of the confusion-matrix diagonal divided by the
  number of test labels. Nothing is returned.
  """
  matrix = confusion_matrix(Ytest, yPred)
  correct = float(np.trace(matrix))
  accuracy = correct / len(Ytest)
  print("\nAccuracy Of SVM For The Given Dataset : ", accuracy)



In [None]:
#X is the dataset and y is the labels
# Train/test arrays for the identity table (features X, labels y).
(X_train, X_test, y_train, y_test) = data_splitting(dataset=dataset_1)

In [None]:
# SVC is already imported with the other libraries at the top of the notebook.
result = {}

# Linear-kernel SVM fitted on the identity-table split from the previous cell.
classifier = SVC(gamma='auto', kernel="linear", random_state=1).fit(X_train, y_train)
print('fit done')
result_ = classifier.predict(X_test)
print('predict done')


In [None]:
# --- Transaction-only features (dataset_2) ---
X_train, X_test, y_train, y_test = data_splitting(dataset_2)
classifier = SVC(kernel='linear', random_state=1, gamma='auto')
classifier.fit(X_train, y_train)
print('fit done')
result_2 = classifier.predict(X_test)
y_test_2 = y_test  # keep the matching labels for later evaluation
print('predict done')

# --- Merged transaction + identity features (dataset_3) ---
# These predictions used to be stored in `result_2` too, which discarded the
# dataset_2 predictions above; they now get their own name.
X_train, X_test, y_train, y_test = data_splitting(dataset_3)
classifier = SVC(kernel='linear', random_state=1, gamma='auto')
classifier.fit(X_train, y_train)
print('fit done')
result_3 = classifier.predict(X_test)
y_test_3 = y_test  # keep the matching labels for later evaluation
print('predict done')


