In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Scenario
#### A popular social media platform for sharing photos and videos has received complaints about fake user accounts. These fake accounts are said to have left spam comments on genuine user posts. Management has asked us to create a machine learning model that will help the platform distinguish real accounts from fake accounts. The company would then use the model to identify fake accounts so they can subsequently be deleted from the platform.
![FAKE SOCIAL MEDIA ACCOUNTS and IT’S CONCERN?](https://www.endnowfoundation.org/wp-content/uploads/elementor/thumbs/Detect-Fake-Profiles-on-Social-Media-p6yfct3ismgslao8tyklprwyrfd5tttfiwrd6xcjuw.jpg)
##### Image taken from: https://www.endnowfoundation.org/detect-fake-profiles-on-social-media-php/
##### Data about real and fake user accounts in social_media_train.csv.

# 1) Gather Data
#### The data is in the social_media_train.csv file. The target vector is given by the 'fake' column. Here, the modules that are typically needed for reading and exploring the data are imported, and the data is then read into the pandas DataFrame df.

In [None]:
# Import modules 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the training data; the first CSV column is used as the row index.
train_csv_path = "/kaggle/input/social-media-train/social_media_train.csv"
df = pd.read_csv(train_csv_path, index_col=[0])

# Print column names, non-null counts and dtypes
df.info()

# 2) Dataset Description 


In [None]:
# Load the data dictionary (indexed by its 'No.' column) and display it.
data_dict_path = '/kaggle/input/fake-account-data-dict/fake_account__data_dict.csv'
data_dict = pd.read_csv(data_dict_path, index_col='No.')
data_dict

# 3) Goal of this project
##### The goal here is to predict whether a user account is fake or not. A problem of this nature is called a binary classification problem (binary since we have two categories). We use int numbers to specify the two categories. In the 'fake' column, a 1 represents that the account in that row is fake, while a 0 indicates a real account. 

# 4) Exploratory Data Analysis (EDA)
## Understand Data
#### It is necessary to become familiar with the data at the beginning, so that we know later what to look for while cleaning and preparing the data.

In [None]:
import warnings

# pandas display: show every column and render floats with two decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# numpy display: no scientific notation when printing arrays
np.set_printoptions(suppress=True)

# Silence all warnings for the rest of the notebook
warnings.filterwarnings('ignore')

# First five rows of the data
df.head(n=5)

#### Each line of df represents a user or user account.

In [None]:
# Summary statistics of df as computed by DataFrame.describe()
df.describe()

In [None]:
# Split the columns of df into numerical and categorical features.

# Numerical columns (listed by hand)
num_cols = [
    'ratio_numlen_username',
    'len_fullname',
    'ratio_numlen_fullname',
    'len_desc',
    'num_posts',
    'num_followers',
    'num_following',
]

# Categorical columns: every remaining column of df, in original column order
cat_cols = [name for name in df.columns if name not in num_cols]
cat_cols

In [None]:
# Get an idea of the target category: fake
# value_counts() orders the classes by frequency, not by class value, so the
# order is pinned to [0, 1] explicitly. Otherwise the hard-coded labels and
# colors below could be attached to the wrong wedge of the pie.
fake_share = df["fake"].value_counts().reindex([0, 1], fill_value=0)
mylabel = ["Not fake(0)", "fake(1)"]   # aligned with the [0, 1] order above
colors = ['#99ff99', '#ff9999']        # green = not fake, red = fake

plt.figure(figsize=(15, 6))
plt.pie(fake_share,
        labels=mylabel, autopct="%1.1f%%", colors=colors,
        textprops={'fontsize': 16})
plt.axis("equal");  # equal aspect ratio so the pie is drawn as a circle

#### The two target classes are well balanced.

In [None]:
# Percentage of missing values per column, largest first

n_rows = len(df)
percent_missing = df.isnull().sum().mul(100).div(n_rows)
missing_value_df = percent_missing.to_frame('percent_missing (%)')
missing_value_df.sort_values(by='percent_missing (%)', ascending=False)

In [None]:
# Check data correlation.
# At this point df still contains string-valued categorical columns (they are
# encoded further below), and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0. Restrict the computation to numeric/boolean columns, and
# compute the matrix only once.
corr = df.select_dtypes(include=['number', 'bool']).corr()

# Correlation of every numeric feature with the target, sorted ascending
display(corr['fake'].sort_values())

# Correlation heatmap
# Colormap: most negative correlations (dark blue) to most positive correlations (dark red)
corr.style.background_gradient(cmap='coolwarm')

 #### Just like linear regression, logistic regression makes a number of assumptions. For continuous data, the following are relevant:
#### *  The features should not be strongly correlated with each other. 
#### *  There should be a linear relationship between the features and the sigmoid transformed probabilities.

#### As can be seen, all correlation values between the features are relatively close to 0, shown in dark blue (first criterion above fulfilled). The number of characters in the account description (len_desc) and the ratio of numeric characters in the account username (ratio_numlen_username) show the strongest positive and negative correlations with the fake status.

In [None]:
# First ten rows of the categorical columns
display(df[cat_cols].head(10))
print('----------------------')

# Print the distinct values found in each categorical column
report_template = "\nColumn name: {}\nUnique values: {}"
for col in cat_cols:
    print(report_template.format(col, df[col].unique()))

# 5) One-hot encoding
#### Most machine learning models can only deal with numeric features. However, as with the categorical columns above, many important real-world features are not numeric but categorical.
#### The categorical features need to be transformed into numerical features. While numerous techniques exist to transform these features, the most common technique is one-hot encoding. This 1-of-n encoding uses pdp.OneHotEncode() to create a set of new 0/1 features from a categorical feature with more than two categories.

In [None]:
import pdpipe as pdp

# Label encoding for the categorical columns with two unique values ('Yes'/'No')
dict_label_encoding = {'Yes': 1, 'No': 0}
for binary_col in ['profile_pic', 'extern_url', 'private']:
    # Assign the whole column (df[col] = ...) instead of df.loc[:, col] = ...:
    # on pandas >= 2.0 the .loc form writes into the existing object-dtype
    # column in place, so the encoded 0/1 values would stay dtype `object`.
    # The explicit cast guarantees a numeric column and fails loudly if a
    # value other than 'Yes'/'No' is present.
    df[binary_col] = df[binary_col].replace(dict_label_encoding).astype('int64')


# One-hot encoding of the multi-category column(s); drop_first=False keeps
# one indicator column per category.
onehot_columns = ["sim_name_username"]
onehot = pdp.OneHotEncode(onehot_columns, drop_first=False)

# Fit the encoder on the train set and transform it in the same step
df = onehot.fit_transform(df)

In [None]:
# Overview of the train set: display df as the cell's output
df

# 6) Logistic regression without regularization
#### We can now set up a logistic regression model (see "Linear regression versus logistic regression"). By default, sklearn's logistic regression algorithm already uses <a href="https://www.simplilearn.com/tutorials/machine-learning-tutorial/regularization-in-machine-learning#:~:text=REGISTER%20NOW-,What%20is%20Regularization%20in%20Machine%20Learning%3F,-Regularization%20refers%20to">regularization</a>, with the regularization parameter C=1.0. If we assign an extremely large value to C, such as a 1 followed by 42 zeros (1e42), effectively no regularization is performed. That's what we want to achieve here first.

#### Unfortunately, the algorithm needs many iterations to solve the problem. The default 100 iterations are not enough. Therefore, we should also assign a relatively large number to max_iter. This parameter sets the maximum number of iterations the solver is allowed to take to converge. 10000 (1e4) should suffice here.


In [None]:
# Import logistic regression library
from sklearn.linear_model import LogisticRegression

# Initiate model
# - C=1e42: the inverse regularization strength is so large that the penalty
#   is effectively switched off.
# - max_iter must be an integer (a float such as 1e4 is rejected by
#   scikit-learn's parameter validation).
model_logreg = LogisticRegression(solver='lbfgs', max_iter=10000, C=1e42, random_state=42)

# Define feature and target values
# Drop the target COLUMN. df.drop(df['fake']) would instead drop the rows whose
# index labels equal the 0/1 target values and leave 'fake' among the features.
feature_train = df.drop(columns='fake')
target_train = df['fake']

