In [2]:
import numpy as np
import pandas as pd

In [8]:
# "col2" only carries a value at index label 1; when the frame is built on
# index [0, 1] the Series is aligned by label, so row 0 ends up as NaN.
col2_values = pd.Series([1], index=[1])
df = pd.DataFrame(
    data={"col1": [0, 1], "col2": col2_values},
    index=[0, 1],
)

In [10]:
# Selecting with a single column label (a plain string) yields a Series.
type(df['col1'])

pandas.core.series.Series

In [11]:
# Selecting with a list of labels yields a DataFrame, even for one column.
type(df[['col1']])

pandas.core.frame.DataFrame

In [13]:
# "col2" was built from a Series that only had index label 1, so the value
# at row 0 is missing and the column is displayed as float (1.0).
df[['col2']]

Unnamed: 0,col2
0,
1,1.0


## Count Vectorize Example

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

# Read the tweets and discard every row that has no value in `target`
tweets_df = pd.read_csv('data/balanced_tweets.csv')
tweets_df = tweets_df.dropna(subset=['target'])

# Feature is the `text` column, label is the `target` column
X, y = tweets_df['text'], tweets_df['target']

# Hold out 20% of the rows as the test set (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

# CountVectorizer feeds SVC. The step names are the ones make_pipeline would
# generate, so the "countvectorizer__" prefix in param_grid resolves to step 1.
pipe = Pipeline(steps=[
    ("countvectorizer", CountVectorizer()),
    ("svc", SVC()),
])

# Candidate values for the vectorizer's max_features: 1 through 999
param_grid = {"countvectorizer__max_features": range(1, 1000)}

# Randomized search: 10 sampled candidates, 5-fold CV, all cores, seeded.
# Fitted on the training split only.
tweet_search = RandomizedSearchCV(
    pipe,
    param_grid,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    random_state=2020,
)
tweet_search.fit(X_train, y_train)

# Best max_features value found by the search, saved as tweet_feats
tweet_feats = tweet_search.best_params_['countvectorizer__max_features']
print(tweet_feats)

# Best score reported by the search, saved as tweet_val_score
tweet_val_score = tweet_search.best_score_
print(tweet_val_score)

# Score of the search object on the held-out test split, saved as tweet_test_score
tweet_test_score = tweet_search.score(X_test, y_test)
print(tweet_test_score)

## Multiclass Logistic Regression

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix

# Loading in the data
bball = pd.read_csv('data/bball.csv')

# Keep only rows where none of the three draft columns is 'Undrafted'
is_drafted = (
    (bball['draft_year'] != 'Undrafted')
    & (bball['draft_round'] != 'Undrafted')
    & (bball['draft_peak'] != 'Undrafted')
)
bball = bball[is_drafted]

# Collapse the hybrid position labels (and 'C') into a single 'Other' class.
# NOTE(review): DataFrame.replace with a flat dict is applied to every column,
# not only `position` -- confirm no other column contains these values.
bball = bball.replace({'F-G': 'Other', 'F-C': 'Other', 'G-F': 'Other', 'C-F': 'Other', 'C': 'Other'})

# 80/20 train/test split, seeded so the split is repeatable
df_train, df_test = train_test_split(bball, test_size=0.2, random_state=1)

# One shared column list so the train and test feature tables cannot drift apart
feature_cols = ["weight", "height", "draft_year", "draft_round",
                "draft_peak", "team", "salary", "country"]
X_train = df_train[feature_cols]
X_test = df_test[feature_cols]
y_train = df_train['position']
y_test = df_test['position']


# Split the numeric and categorical features
# NOTE(review): "salary" is selected above but appears in neither list, so the
# column transformer (default remainder="drop") discards it -- confirm intended.
numeric_features = ["weight",
                    "height",
                    "draft_year",
                    "draft_round",
                    "draft_peak"]

categorical_features = ["team", "country"]


# Numeric pipeline: fill missing values with the median, then standardize
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler())

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

# Column transformer: route each feature group to its own pipeline
col_transformer = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features))

# Main pipeline: preprocessing followed by logistic regression
lr_pipe = make_pipeline(col_transformer, LogisticRegression())

# Fit the pipeline on the training set
lr_pipe.fit(X_train, y_train)

# Plot the confusion matrix on the test set (trailing ';' hides the object repr)
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(lr_pipe, X_test, y_test);