In [1]:
import pandas as pd 

In [17]:
# Load the pre-processed TF-IDF train and test splits from the shared data directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/Processed Dataset TF-IDF Train',
        '../Data/Processed Dataset TF-IDF Test',
    )
)

In [18]:
# Remove the same four columns from both splits. A single list-based `drop`
# per frame replaces four chained single-column drops, so each split is rebuilt
# once instead of four times and the two splits cannot drift out of sync.
train_data, test_data = (
    frame.drop(columns=['time of day', 'id', 'date_x', 'user'])
    for frame in (train_data, test_data)
)

In [24]:
# Drop the 'rendered_content' column from the training split.
train_data = train_data.drop(columns=['rendered_content'])

In [25]:
# Drop the 'Adjusted Tweet' column from the training split.
train_data = train_data.drop(columns=['Adjusted Tweet'])

In [26]:
# Drop the 'rendered_content' column from the test split.
test_data = test_data.drop(columns=['rendered_content'])

In [27]:
# Drop the 'Adjusted Tweet' column from the test split.
test_data = test_data.drop(columns=['Adjusted Tweet'])

In [8]:
from typing import Optional, Union
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
#from lightgbm import LGBMClassifier, LGBMRegressor


class HurdleRegression(BaseEstimator):
    """Two-part ("hurdle") regressor for targets with an excess of zeros.

    Two sub-models are fitted and their predictions multiplied together:
        1) a binary classifier trained on whether the target is > 0
        2) a regressor trained only on the rows whose target is > 0

    Implemented as an sklearn estimator (subclasses ``BaseEstimator``), so it
    can be used in pipelines and GridSearch objects.

    Args:
        clf_name: key of the classifier sub-model. Currently 'logistic'
            (the 'LGBMClassifier' entry is commented out in
            ``_resolve_estimator``).
        reg_name: key of the regression sub-model. Currently 'linear'
            (the 'LGBMRegressor' entry is commented out in
            ``_resolve_estimator``).
        clf_params: dict of parameters applied to the classifier sub-model
            via ``set_params`` at fit time.
        reg_params: dict of parameters applied to the regression sub-model
            via ``set_params`` at fit time.
    """

    def __init__(self,
                 clf_name: str = 'logistic',
                 reg_name: str = 'linear',
                 clf_params: Optional[dict] = None,
                 reg_params: Optional[dict] = None):
        # Arguments are stored verbatim; sub-estimators are only built in fit().
        self.clf_name = clf_name
        self.reg_name = reg_name
        self.clf_params = clf_params
        self.reg_params = reg_params

    @staticmethod
    def _resolve_estimator(func_name: str):
        """Return a fresh, unfitted sub-estimator for ``func_name``.

        Estimators are referenced by name because sklearn estimator default
        arguments must pass an equality test, and instantiated sub-estimators
        are not equal.

        Raises:
            KeyError: if ``func_name`` is not a supported estimator name.
        """
        # Map names to factories so only the requested estimator is built,
        # rather than instantiating every supported estimator on each lookup.
        factories = {
            'linear': LinearRegression,
            'logistic': lambda: LogisticRegression(solver='liblinear'),
            # 'LGBMRegressor': lambda: LGBMRegressor(n_estimators=50),
            # 'LGBMClassifier': lambda: LGBMClassifier(n_estimators=50),
        }

        try:
            factory = factories[func_name]
        except KeyError:
            # Same exception type as a plain dict lookup, but with guidance.
            raise KeyError(
                f'Unsupported estimator name {func_name!r}; '
                f'expected one of {sorted(factories)}'
            ) from None
        return factory()

    def _build_sub_estimator(self, name: str, params: Optional[dict]):
        """Create the named sub-estimator and apply optional parameter overrides."""
        estimator = self._resolve_estimator(name)
        if params:
            estimator.set_params(**params)
        return estimator

    def _check_predict_input(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Ensure the model is fitted, then validate ``X`` for prediction."""
        # Check fitted state first so an unfitted model reports that clearly.
        check_is_fitted(self, 'is_fitted_')
        # Use the same NaN policy as fit(); each sub-estimator still applies
        # its own missing-value rules.
        return check_array(X, accept_sparse=False, accept_large_sparse=False,
                           force_all_finite='allow-nan')

    def fit(self,
            X: Union[np.ndarray, pd.DataFrame],
            y: Union[np.ndarray, pd.Series]) -> 'HurdleRegression':
        """Fit the classifier on ``y > 0`` and the regressor on the positive rows.

        Args:
            X: dense 2-D feature matrix with at least two columns.
            y: target values; rows with ``y > 0`` form the regression subset.

        Returns:
            The fitted estimator (``self``).

        Raises:
            ValueError: if ``X`` has fewer than two features, or if no target
                value is greater than zero.
            KeyError: if ``clf_name`` or ``reg_name`` is not supported.
        """
        # NOTE(review): newer scikit-learn releases (1.6+) rename
        # `force_all_finite` to `ensure_all_finite` -- confirm against the
        # installed version before upgrading.
        X, y = check_X_y(X, y, dtype=None,
                         accept_sparse=False,
                         accept_large_sparse=False,
                         force_all_finite='allow-nan')

        if X.shape[1] < 2:
            raise ValueError('Cannot fit model when n_features = 1')

        # Build and configure both sub-models up front so a bad name or
        # parameter fails before any time is spent fitting.
        clf = self._build_sub_estimator(self.clf_name, self.clf_params)
        reg = self._build_sub_estimator(self.reg_name, self.reg_params)

        is_positive = y > 0
        if not is_positive.any():
            # The regressor would otherwise be trained on zero rows.
            raise ValueError(
                'Cannot fit hurdle model: no target values greater than zero '
                'to train the regression sub-model on'
            )

        clf.fit(X, is_positive)
        reg.fit(X[is_positive], y[is_positive])

        self.clf_ = clf
        self.reg_ = reg
        self.is_fitted_ = True
        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """ Predict combined response using binary classification outcome """
        X = self._check_predict_input(X)
        # The classifier's hard 0/1 decision gates the regressor's estimate.
        return self.clf_.predict(X) * self.reg_.predict(X)

    def predict_expected_value(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """ Predict combined response using probabilistic classification outcome """
        X = self._check_predict_input(X)
        # Column 1 of predict_proba is the probability of the positive class.
        return self.clf_.predict_proba(X)[:, 1] * self.reg_.predict(X)

# Disabled manual smoke test, kept inside a bare triple-quoted string so that
# none of it executes. It would run sklearn's `check_estimator` on a default
# HurdleRegression and then fit/predict on a `make_regression` dataset.
# NOTE: as the final expression of the cell, the string itself is echoed back
# as the cell's output.
'''
def manual_test():
    """ Validate estimator using sklearn's provided utility and ensure it can fit and predict on fake dataset. """
    check_estimator(HurdleRegression())
    from sklearn.datasets import make_regression
    X, y = make_regression()
    reg = HurdleRegression()
    reg.fit(X, y)
    reg.predict(X)


if __name__ == '__main__':
    manual_test()
'''

'\ndef manual_test():\n    """ Validate estimator using sklearn\'s provided utility and ensure it can fit and predict on fake dataset. """\n    check_estimator(HurdleRegression())\n    from sklearn.datasets import make_regression\n    X, y = make_regression()\n    reg = HurdleRegression()\n    reg.fit(X, y)\n    reg.predict(X)\n\n\nif __name__ == \'__main__\':\n    manual_test()\n'

In [28]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression


# Pull the 'retweets' target out of each split; all remaining columns are features.
y_train = train_data['retweets']
X_train = train_data.drop('retweets', axis=1)
y_test = test_data['retweets']
X_test = test_data.drop('retweets', axis=1)

# Sub-model choices for the hurdle model: classifier first, regressor second.
clf_name, reg_name = 'logistic', 'linear'

# Build the two-part estimator and train it on the training split.
model = HurdleRegression(clf_name=clf_name, reg_name=reg_name)
model.fit(X=X_train, y=y_train)

# Predict retweet counts for the held-out test split.
y_pred = model.predict(X=X_test)

# Report the root of the mean squared error between actual and predicted values.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print("RMSE:", rmse)

RMSE: 15.532999069422997
