In [None]:
### Import packages

import pandas as pd
import numpy as np

# Directory holding the input files; the scored output is written here too
path = r'C:\Users\visha\Downloads'

data_file = path + r'\ad.data'
header_file = path + r'\ad.names'

### Read the scoring data

# The data file has no header row, so columns are loaded positionally.
# The last column is the target. It would not be available in a real scoring
# run, but it is present here because the training dataset is being re-read,
# so it is sliced off immediately after loading.
df = pd.read_csv(data_file, header=None).iloc[:, :-1]

print('Input file:', df.shape)

### Header

# Column names are parsed from the names file: the text before the first ':'
# on each attribute line is used as the column name.

# The context manager closes the file handle as soon as it has been read
with open(header_file, "r") as header_text:
    lines = header_text.read().split('\n')

# Skip the first four lines and the final split element. Every remaining line
# that is neither empty nor starts with '|' contributes one column name.
# Checking `line` for truthiness first avoids the IndexError that `line[0]`
# raises on an empty string.
header = [
    line.split(':')[0]
    for line in lines[4:-1]
    if line and not line.startswith('|')
]

df.columns = header

### Type conversion

# Coerce every column to a numeric dtype; unparseable values become NaN
df = df.apply(pd.to_numeric, errors='coerce')

### Missing Value Imputation

# Fill missing widths with a fixed constant
# NOTE(review): 110.0 is presumably the imputation value derived from the
# training data - confirm against the training pipeline
df['width'] = df['width'].fillna(110.000)

### Transformations

# Natural log of width, computed as a single vectorized NumPy call on the
# whole column instead of a Python-level loop over its elements
df['width_log'] = np.log(df['width'])

# Drop the raw column now that its log-transformed version exists
df.drop('width', axis=1, inplace=True)

### Standardization

# Per-column (center, scale) constants used to standardize each model input.
# NOTE(review): presumably the mean and standard deviation estimated on the
# training data - confirm against the training pipeline
standardization_params = {
    'width_log': (4.737299159267481, 0.7186450774456629),
    'ancurl*http+www': (0.028146989835809225, 0.16545736952878795),
    'ancurl*click': (0.05238467552775606, 0.2228886799414486),
    'ancurl*adclick': (0.0109460516028147, 0.10408989763638597),
    'url*ads+media': (0.009382329945269743, 0.09644466997564541),
    'url*doubleclick.net': (0.007818608287724784, 0.08811099628298527),
    'ancurl*redirect': (0.04300234558248632, 0.2029417282495575),
    'ancurl*click+profileid': (0.018764659890539485, 0.13574591980549267),
    'ancurl*groupid': (0.018764659890539485, 0.13574591980549292),
    'ancurl*url': (0.009382329945269743, 0.09644466997564581),
    'ancurl*nph': (0.008600469116497263, 0.09237517415968315),
    'origurl*bin': (0.01641907740422205, 0.12713035764946684),
    'origurl*home.netscape.com': (0.00547302580140735, 0.07380603528569575),
}

# z = (x - center) / scale, applied to each whole column in one vectorized
# step rather than element by element in a Python loop
for column, (center, scale) in standardization_params.items():
    df[column] = (df[column] - center) / scale

### Apply the scoring algorithm

# Model weights, listed in the order in which their terms are summed
coefficients = [
    ('width_log', 1.2151395008340888),
    ('ancurl*http+www', 1.0148186614592152),
    ('ancurl*click', 0.9909770949840874),
    ('ancurl*adclick', 0.8880357853114902),
    ('url*ads+media', 0.8169338209836402),
    ('url*doubleclick.net', 0.6586832862099736),
    ('ancurl*redirect', 0.8580941709622609),
    ('ancurl*click+profileid', -0.7312487336771645),
    ('ancurl*groupid', 0.635988245254226),
    ('ancurl*url', 0.4173900515959437),
    ('ancurl*nph', 0.6742430652510202),
    ('origurl*bin', 0.5088969111274405),
    ('origurl*home.netscape.com', 0.5777648569842253),
]
intercept = -2.748908581570469

# Linear predictor: accumulate coefficient * feature one term at a time in
# list order, then add the intercept last
(first_col, first_coef), *remaining = coefficients
xbeta = df[first_col] * first_coef
for col, coef in remaining:
    xbeta = xbeta + df[col] * coef
xbeta = xbeta + intercept

### Calculate the probabilities

# Logistic (sigmoid) transform of the linear predictor
negated_xbeta = -xbeta
df['score'] = 1 / (1 + np.exp(negated_xbeta))

print(df['score'].describe())

# Write the frame, including the new score column, next to the input files
df.to_csv(path + r'/scored_df.csv')

  interactivity=interactivity, compiler=compiler, result=result)


Input file: (3279, 1558)
count    3279.000000
mean        0.127665
std         0.283059
min         0.000008
25%         0.016393
50%         0.021277
75%         0.045497
max         1.000000
Name: score, dtype: float64
