In [1]:
from sklearn import datasets
import numpy as np
iris = datasets.load_iris()
X = iris.data[:, [2, 3]]
y = iris.target
#print('Class labels:',

In [7]:
np.unique(y)

array([0, 1, 2])

In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)

In [19]:
##Standardising the features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)


In [15]:
print(np.bincount(y_test))

[15 15 15]


In [22]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=100.0,random_state=1,solver='lbfgs',multi_class='ovr') ##C inverse regularization parameter,weight coefficient shrink when we decreae C i.e. increase the regularisation
lr.fit(X_train_std,y_train)


In [27]:
y_pred=lr.predict(X_test_std)

In [26]:
from sklearn.metrics import accuracy_score

In [28]:
accuracy_score(y_pred,y_test)

0.9777777777777777

In [29]:
from sklearn.svm import SVC
svml = SVC(kernel='linear',C=1,random_state=1)

In [31]:
svml.fit(X_train_std,y_train)

In [34]:
svml_pred=svml.predict(X_test_std)

In [35]:
accuracy_score(svml_pred,y_test)

0.9777777777777777

In [39]:
from sklearn.svm import SVC
svml = SVC(kernel='rbf',C=1,random_state=1,gamma=0.1)

In [38]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='gini',max_depth=4,random_state=1)


In [41]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(criterion = 'gini', n_estimators=25,n_jobs=2,random_state=1)

In [45]:
from sklearn.neighbors import KNeighborsClassifier
neighbors = KNeighborsClassifier(n_neighbors=5,p=2) ### p=2 Euclidian distance

In [142]:
##Preprocessing

In [144]:
import pandas as pd
from io import StringIO


In [146]:
csv = \
    '''
    A,B,C,D
    1.0,2.0,3.0,4.0
    5.0,6.0,,8.0
    10.0,11.0,12.0
    '''

df = pd.read_csv(StringIO(csv))

In [148]:
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


In [151]:
df.isnull().sum()

    A    0
B        0
C        1
D        1
dtype: int64

In [152]:
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [155]:
df.dropna()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [154]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [156]:
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [157]:
# drop rows that have fewer than 3 real values 

df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [158]:
# only drop rows where NaN appear in specific columns (here: 'C')

df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [163]:
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [3]:
##for categorical data
import pandas as pd

df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
                   ['red', 'L', 13.5, 'class1'],
                   ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


##Mapping ordinal features

In [4]:
size_mapping = {'XL': 3,
                'L': 2,
                'M': 1}

df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [5]:
inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [175]:
inv_size_mapping

{3: 'XL', 2: 'L', 1: 'M'}

##Encoding labelss

In [190]:
np.unique(df['classlabel'])

array(['class1', 'class2'], dtype=object)

In [7]:
import numpy as np
class_mapping = {val:idx for idx,val in enumerate(np.unique(df['classlabel']))}
df['classlabel'] = df['classlabel'].map(class_mapping)
df['classlabel']

0    1
1    0
2    1
Name: classlabel, dtype: int64

In [194]:
class_mapping

{'class1': 0, 'class2': 1}

In [204]:
inv_class_mapping = {val:idx for idx,val in class_mapping.items()}
df['classlabel']=df['classlabel'].map(inv_class_mapping)

In [209]:
##above can also be done using LabelEncoder from sklearn
from sklearn.preprocessing import LabelEncoder
lb= LabelEncoder()
df['classlabel'] = lb.fit_transform(df['classlabel'].values)


In [210]:
df['classlabel']

0    1
1    0
2    1
Name: classlabel, dtype: int32

In [213]:
lb.inverse_transform(df['classlabel'])

array(['class2', 'class1', 'class2'], dtype=object)

## OHE for Nominal features

In [221]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [244]:
X= df[['color','size','price']].values

In [234]:
X[:,0]

array(['green', 'red', 'blue'], dtype=object)

In [237]:
X[:,0].shape

(3,)

In [247]:
X[:,0].reshape(-1,1)

array([['green'],
       ['red'],
       ['blue']], dtype=object)

In [241]:
from sklearn.preprocessing import OneHotEncoder

X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [248]:
from sklearn.preprocessing import OneHotEncoder
X= df[['color','size','price']].values

ohe = OneHotEncoder()
d=ohe.fit_transform(X[:,0].reshape(-1,1)).toarray()

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
X= df[['color','size','price']].values

ct = ColumnTransformer([('onehot',OneHotEncoder(),[0]),
                          ('nothing','passthrough',[1,2])])

ct.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [12]:
ct.get_feature_names_out()

array(['onehot__x0_blue', 'onehot__x0_green', 'onehot__x0_red',
       'nothing__x1', 'nothing__x2'], dtype=object)

In [16]:
##OHE via pandas

pd.get_dummies(df[['color','size','price']])

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,False,True,False
1,2,13.5,False,False,True
2,3,15.3,True,False,False


In [17]:
# multicollinearity guard in get_dummies

pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,True,False
1,13.5,2,False,True
2,15.3,3,False,False


In [20]:
# multicollinearity guard for the OneHotEncoder

cohe = OneHotEncoder(categories='auto',drop='first')
ct = ColumnTransformer([('onehot',cohe,[0]),('nothing','passthrough',[1,2])])

ct.fit_transform(X).astype(float)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])

In [21]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)

# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df_wine = pd.read_csv('wine.data', header=None)


df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()

Class labels [1 2 3]


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [32]:
from sklearn.model_selection import train_test_split
X,y=df_wine.iloc[:,1:].values , df_wine.iloc[:,0].values
X_train,X_test,y_train,y_test =  train_test_split(X,y,random_state=1,test_size=0.3,stratify=y)


In [34]:
#Bringing features onto the same scale

from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
X_train_norm = mm.fit_transform(X_train)
X_test_norm = mm.transform(X_test)


In [37]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train_std = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [47]:
#L1 and L2 regularization as penalties against model complexity
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression(penalty='l1',C=1,multi_class='ovr')

In [66]:
import pyprind
pbar = pyprind.ProgBar(50000)
import pandas as pd
import os

In [68]:
base_path = 'aclImdb'
label = {'neg':0,'pos':1}
for l in ('test','train'):
    for s in ('neg','pos'):
        path = os.path.join(base_path,l,s)
        for f in path:
            with open(os.path.join(path,f),"r") as infile:
                txt = infile.read()
                df = df.append(txt,label[s])
        pbar.update()

df.columns= ['review','sentiment']


FileNotFoundError: [Errno 2] No such file or directory: 'aclImdb\\test\\neg\\a'

In [70]:
base_path

'aclImdb'

In [71]:
l

'test'

In [69]:
os.path.join(base_path,l,s)

'aclImdb\\test\\neg'

In [48]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))  ##shuffling the df

In [None]:
df.to_csv("movie_data.csv",index=False,encoding='utf-8')

In [None]:
df= pd.read_csv("movie_data.csv",encoding='utf-8')

In [51]:
##BOG
from sklearn.feature_extraction.text import CountVectorizer

cv= CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two'])

bow = cv.fit_transform(docs)


In [52]:
cv.vocabulary_

{'the': 6,
 'sun': 4,
 'is': 1,
 'shining': 3,
 'weather': 8,
 'sweet': 5,
 'and': 0,
 'one': 2,
 'two': 7}

In [55]:
bow.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [2, 3, 2, 1, 1, 1, 2, 1, 1]], dtype=int64)

In [60]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf = True,
                            norm='l2',
                            smooth_idf=True)
vector = tfidf.fit_transform(cv.fit_transform(docs))


In [None]:
vector.toarray()

array([[0.        , 0.43370786, 0.        , 0.55847784, 0.55847784,
        0.        , 0.43370786, 0.        , 0.        ],
       [0.        , 0.43370786, 0.        , 0.        , 0.        ,
        0.55847784, 0.43370786, 0.        , 0.55847784],
       [0.50238645, 0.44507629, 0.50238645, 0.19103892, 0.19103892,
        0.19103892, 0.29671753, 0.25119322, 0.19103892]])

In [62]:
import re
def preprocessor():
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [64]:
from nltk.stem.porter import PorterStemmer
porter=PorterStemmer()
def tokenizer_stem(text):
    return[porter.stem(word) for word in text.split()]

In [65]:
tokenizer_stem("Welcome to rehabilation")

['welcom', 'to', 'rehabil']

In [136]:
a,y_hat,x={},{},{}
X=[None,1,2,3,4]
Y=[1,2,3,4,7]
loss=0
for t in range(len(X)):
        
        print(t)
        # Set x[t] to be the one-hot vector representation of the t'th character in X.
        # if X[t] == None, we just have x[t]=0. This is used to set the input for the first timestep to the zero vector. 
        x[t] = np.zeros((5,1)) 
        if (X[t] != None):
            x[t][X[t]] = 1
        
        # Run one step forward of the RNN
        #a[t], y_hat[t] = rnn_step_forward(parameters, a[t-1], x[t])
        y_hat[t]=np.array([[0.1],[0.2],[0.3],[0.4],[0.1],[0.5]])
        print(y_hat[t])
        
        # Update the loss by substracting the cross-entropy term of this time-step from it.
        loss -= np.log(y_hat[t][Y[t],0])
        

0
[[0.1]
 [0.2]
 [0.3]
 [0.4]
 [0.1]
 [0.5]]
1
[[0.1]
 [0.2]
 [0.3]
 [0.4]
 [0.1]
 [0.5]]
2
[[0.1]
 [0.2]
 [0.3]
 [0.4]
 [0.1]
 [0.5]]
3
[[0.1]
 [0.2]
 [0.3]
 [0.4]
 [0.1]
 [0.5]]
4
[[0.1]
 [0.2]
 [0.3]
 [0.4]
 [0.1]
 [0.5]]


IndexError: index 7 is out of bounds for axis 0 with size 6

In [122]:
y_hat={2:np.array([[1],[2],[3],[4]])}
y_hat[2][3,0]

4

In [128]:
c= np.array([2],[3],[4])

TypeError: array() takes from 1 to 2 positional arguments but 3 were given

In [121]:
c[0,2]

4

In [141]:
np.log(0.09)

-2.4079456086518722