## Linear Regression

In [None]:
import numpy as np
from sklearn import datasets

#np.set_printoptions(precision=2)

# Draw a tiny one-feature regression problem; the fixed random_state makes
# every run of this cell produce the same samples.
N = 10
X, y = datasets.make_regression(
    n_samples=N,
    n_features=1,
    n_targets=1,
    noise=20,
    random_state=0,
)

# Dump the inputs, their shape and the targets, one labelled line each.
for label, value in (('X=', X), ('X.shape=', X.shape), ('y=', y)):
    print(label, value)

### scatter plot

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Plot the single feature against the target; both axis labels share the
# same font size.
axis_label_style = {'fontsize': 16}

plt.scatter(X, y)
plt.xlabel('x', **axis_label_style)
plt.ylabel('y', **axis_label_style)

### construct training and test datasets

In [None]:
# Regenerate a larger, less noisy data set for the train/test experiment.
N = 50
X, y = datasets.make_regression(n_samples=N, n_features=1, n_targets=1, noise=10, random_state=0)

# Display toggles: edit a flag and re-run the cell to change the figure.
SHOW_TRAIN_BACKGROUND = True   # grey training points behind the test panel
USE_FULL_DATA_LIMITS = False   # pin the test panel to the whole-data axis range

# The first 90% of the samples train the model; the remaining slice is held
# out for testing.
train_size = int(N*.9)
X_tr, y_tr = X[:train_size], y[:train_size]
X_te, y_te = X[train_size:], y[train_size:]

plt.figure(figsize=(15, 3))

plt.subplot(1, 3, 1)
plt.scatter(X, y)
plt.title('Whole data')

plt.subplot(1, 3, 2)
plt.scatter(X_tr, y_tr)
plt.title('Training data')

plt.subplot(1, 3, 3)
if SHOW_TRAIN_BACKGROUND:
    # Draw the background first so it can never cover the test points.
    plt.scatter(X_tr, y_tr, c='lightgray')
plt.scatter(X_te, y_te)
plt.title('Test data')

if USE_FULL_DATA_LIMITS:
    # Applies to the current axes, i.e. the test panel selected above.
    plt.xlim([X.min(), X.max()])
    plt.ylim([y.min(), y.max()])



### fit the training data to a linear model

In [None]:
from sklearn.linear_model import LinearRegression

# Least-squares fit on the training split. fit() returns the estimator
# itself, so construction and training are chained; the bare name on the
# last line keeps the notebook echo of the fitted model.
model = LinearRegression().fit(X_tr, y_tr)
model


In [None]:
# Learned slope(s) first, then the intercept, each on its own line.
print(model.coef_, model.intercept_, sep='\n')

In [None]:
# Training points with the fitted line y = coef * x + intercept drawn over
# the full range of X as a dashed red line.
plt.scatter(X_tr, y_tr)

fitted_line = X * model.coef_ + model.intercept_
plt.plot(X, fitted_line, '--', color='red')


### make predictions using the above model

In [None]:
# Predict targets for the held-out inputs and print them next to the true
# values. Both labels have the same width so the two arrays line up.
y_pred = model.predict(X_te)

print('y_pred =  ', y_pred)
print('y_actual =', y_te)

### plot prediction vs. actual value

In [None]:
# Three panels sharing a grey training-data background:
#   1) actual test points, 2) test points plus predictions,
#   3) same as 2 but with enlarged, translucent test markers.
background_style = {'c': 'lightgray'}
prediction_style = {'c': 'red'}

plt.figure(figsize=(15, 3))

# Panel 1: held-out points only.
plt.subplot(1, 3, 1)
plt.scatter(X_tr, y_tr, **background_style)
plt.scatter(X_te, y_te)
plt.title('Test data')

# Panel 2: held-out points with the model's predictions in red.
plt.subplot(1, 3, 2)
plt.scatter(X_tr, y_tr, **background_style)
plt.scatter(X_te, y_te)
plt.scatter(X_te, y_pred, **prediction_style)
plt.title('Prediction')

# Panel 3: large translucent actual values, so each red prediction is seen
# relative to the actual point at the same x.
# (Alternative: pass marker='.' to shrink the prediction markers.)
plt.subplot(1, 3, 3)
plt.scatter(X_tr, y_tr, **background_style)
plt.scatter(X_te, y_te, s=[200]*len(y_te), alpha=0.5)
plt.scatter(X_te, y_pred, **prediction_style)
plt.title('Prediction')



## Classification

In [None]:
import matplotlib.pyplot as plt

# Load the iris data set and unpack its feature matrix and class labels.
loaded_data = datasets.load_iris()
X, y = loaded_data.data, loaded_data.target

# Preview the first ten feature rows and the full label vector.
print('First 10 data points of X =\n', X[:10])
print('y =', y)

In [None]:
# Keep only columns 0 and 2 (sepal length and petal length in scikit-learn's
# iris feature order) so the data can be shown in 2-D.
# Slice from loaded_data.data rather than from X: X is overwritten here, so
# slicing X itself would raise IndexError when the cell is run a second time.
X = loaded_data.data[:, [0, 2]]
plt.scatter(X[:, 0], X[:, 1], c=y)
X.shape

### Gaussian Naive Bayes model

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the samples for testing (fixed seed for repeatability).
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, random_state=0)
print(X_tr.shape, X_te.shape, sep='\n')

# Fit Gaussian naive Bayes on the training split and classify the held-out
# samples; predicted labels are printed above the true ones.
model = GaussianNB().fit(X_tr, y_tr)
y_pred = model.predict(X_te)
print(y_pred, y_te, sep='\n')

# Marker styles: small dots for training data, large translucent circles for
# the true test labels, stars for the predicted test labels.
train_style = dict(c=y_tr, s=[10]*len(y_tr))
truth_style = dict(c=y_te, marker='o', s=[250]*len(y_te), alpha=0.2)
predicted_style = dict(c=y_pred, marker='*', s=[100]*len(y_te))

plt.figure(figsize=(18, 5))

# Left: training data and the true classes of the test samples.
plt.subplot(1, 2, 1)
plt.scatter(X_tr[:,0], X_tr[:,1], **train_style)
plt.scatter(X_te[:,0], X_te[:,1], **truth_style)

# Right: same, plus the predicted class drawn as a star inside each circle.
plt.subplot(1, 2, 2)
plt.scatter(X_tr[:,0], X_tr[:,1], **train_style)
plt.scatter(X_te[:,0], X_te[:,1], **truth_style)
plt.scatter(X_te[:,0], X_te[:,1], **predicted_style)


### Support Vector Machine (SVM) classifier

In [None]:
from sklearn.svm import SVC

In [None]:
# Same 90/10 split as before (identical seed), so results are comparable.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, random_state=0)
for split in (X_tr, X_te):
    print(split.shape)

# Train a support vector classifier with default settings and classify the
# held-out samples; predictions are printed first, true labels second.
model = SVC()
model.fit(X_tr, y_tr)
y_pred = model.predict(X_te)
for labels in (y_pred, y_te):
    print(labels)

# Split each feature matrix into its two plotted columns once.
f0_tr, f1_tr = X_tr[:, 0], X_tr[:, 1]
f0_te, f1_te = X_te[:, 0], X_te[:, 1]
n_tr, n_te = len(y_tr), len(y_te)

plt.figure(figsize=(18, 5))

# Left panel: training dots plus large translucent circles for true test labels.
plt.subplot(1, 2, 1)
plt.scatter(f0_tr, f1_tr, c=y_tr, s=[10]*n_tr)
plt.scatter(f0_te, f1_te, c=y_te, marker='o', s=[250]*n_te, alpha=0.2)

# Right panel: the same picture with predicted labels overlaid as stars.
plt.subplot(1, 2, 2)
plt.scatter(f0_tr, f1_tr, c=y_tr, s=[10]*n_tr)
plt.scatter(f0_te, f1_te, c=y_te, marker='o', s=[250]*n_te, alpha=0.2)
plt.scatter(f0_te, f1_te, c=y_pred, marker='*', s=[100]*n_te)

## Text Classification

In [None]:
import pandas as pd
# Show long review texts in full when DataFrames are displayed.
pd.options.display.max_colwidth = 500

# Read one review per line from each file. The context managers close the
# file handles as soon as the lines have been read.
with open('rt-polarity.neg', encoding='latin-1') as neg_file:
    neg = neg_file.readlines()
with open('rt-polarity.pos', encoding='latin-1') as pos_file:
    pos = pos_file.readlines()

# Negative reviews get label 0 and positive reviews label 1, in file order.
data = {'text': neg + pos, 'label': [0]*len(neg) + [1]*len(pos)}
df = pd.DataFrame(data)
df.head()

In [None]:
# Number of reviews per class (0 = negative, 1 = positive).
df['label'].value_counts()

In [None]:
# Trim leading/trailing whitespace from every review, which also removes the
# line terminator that readlines() keeps on each line.
df['text'] = df['text'].map(str.strip)
df.head()

In [None]:
# Shuffle all rows (frac=1 samples the whole frame) with a fixed seed, so the
# negatives-then-positives file order is mixed before the positional split.
df = df.sample(
    frac=1,
    random_state=0,
)
df.head()

In [None]:
# The first 1000 shuffled rows train the classifier; the rest are for testing.
train_size = 1000
df_tr, df_te = df.iloc[:train_size], df.iloc[train_size:]

# Check the class balance of each split.
for part in (df_tr, df_te):
    print(part['label'].value_counts())

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: a token is any run of non-space characters, and it
# must appear in at least 15 training documents to enter the vocabulary.
count_vect = CountVectorizer(
    token_pattern='[^ ]+',
    min_df=15,
)
X_tr = count_vect.fit_transform(df_tr['text'])
X_tr.shape

In [None]:
# Inspect the vocabulary learned by the fitted vectorizer (token -> column index).
count_vect.vocabulary_

In [None]:
# Preview the count vectors of the first two training documents. Slice the
# sparse matrix first so only those two rows are converted to a dense array.
X_tr[:2].toarray()

In [None]:
# First three training rows, for comparison with the count vectors.
df_tr.head(3)

In [None]:
# Encode the test reviews with the already-fitted vectorizer (transform only,
# no refitting).
X_te = count_vect.transform(df_te['text'])
X_te.shape

In [None]:
from sklearn.svm import LinearSVC
# Train a linear SVM on the bag-of-words features. fit() returns the
# estimator, and the bare name keeps the notebook echo of the fitted model.
model = LinearSVC().fit(X_tr, df_tr['label'])
model

In [None]:
# Predict a label for every test review.
y_pred = model.predict(X_te)

In [None]:
# First ten predicted labels.
y_pred[:10]

In [None]:
# True labels of the first ten test reviews, for comparison with the
# predictions.
df_te['label'].head(10)

In [None]:
import numpy as np
# Accuracy: the fraction of test reviews whose predicted label equals the
# true label.
(y_pred == df_te['label']).mean()