# Capstone 4

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
import sklearn.metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

Topic: I am approaching this as a data analyst (DA) at a business that wants to perform better than other sellers in the same country and to uncover what those sellers are doing better that we could potentially adopt. I'd also like to predict the number of products sold based on seller gender, country, number of followers, etc.

# Loading datasets

In [7]:
# Country names in each dataset are written in French. When inspecting the files in Excel, I noticed
# encoding problems with the accented characters. I re-saved the files as UTF-8 and used
# IFERROR + VLOOKUP to translate the country names into English.

In [2]:
# Dealing with mis-encoded accented characters:
# Open Excel
# Click “File” and “New”
# Click on the “Data” tab
# Click “From Text” and select the CSV file
# Select “Delimited”
# For “File origin”, select “65001 : Unicode (UTF-8)”
# Click “Next”
# Select “Comma”
# Click “Finish”
# Excel should now show you the CSV file and display the characters correctly.

In [8]:
# Loading revised datasets

In [9]:
# User-level dataset: social and shopping-behaviour columns for each account
# (followers, follows, products liked/listed/sold, app usage, seniority, ...).
# NOTE(review): the path points into one machine's home directory; confirm it
# exists before re-running on another machine.
users = pd.read_csv(
    filepath_or_buffer='~/Documents/JONATHANAYALA-CAPSTONE4/DATA/users.csv',
    encoding='utf8',
)

# Preview the first five rows (five is head()'s default).
users.head()

Unnamed: 0,identifierHash,type,country,eng_country,language,socialNbFollowers,socialNbFollows,socialProductsLiked,productsListed,productsSold,...,civilityTitle,hasAnyApp,hasAndroidApp,hasIosApp,hasProfilePicture,daysSinceLastLogin,seniority,seniorityAsMonths,seniorityAsYears,countryCode
0,-1.0979e+18,user,Royaume-Uni,United Kingdom,en,147,10,77,26,174,...,mr,True,False,True,True,11,3196,106.53,8.88,gb
1,2.34757e+18,user,Monaco,Monaco,en,167,8,2,19,170,...,mrs,True,False,True,True,12,3204,106.8,8.9,mc
2,6.87094e+18,user,France,France,fr,137,13,60,33,163,...,mrs,True,False,True,False,11,3203,106.77,8.9,fr
3,-4.64027e+18,user,Etats-Unis,United States,en,131,10,14,122,152,...,mrs,True,False,True,False,12,3198,106.6,8.88,us
4,-5.17583e+18,user,Etats-Unis,United States,en,167,8,0,25,125,...,mrs,False,False,False,True,22,2854,95.13,7.93,us


In [10]:
# Seller accounts compared by country and sex: seller counts plus mean/total
# figures for products sold, listed, bought, wished and liked.
# NOTE(review): the path points into one machine's home directory; confirm it
# exists before re-running on another machine.
sellers = pd.read_csv(
    filepath_or_buffer='~/Documents/JONATHANAYALA-CAPSTONE4/DATA/sellers.csv',
    encoding='utf8',
)

# Preview the first five rows (five is head()'s default).
sellers.head()

Unnamed: 0,country,eng_country,sex,nbsellers,meanproductssold,meanproductslisted,meansellerpassrate,totalproductssold,totalproductslisted,meanproductsbought,meanproductswished,meanproductsliked,totalbought,totalwished,totalproductsliked,meanfollowers,meanfollows,percentofappusers,percentofiosusers,meanseniority
0,Allemagne,Germany,Female,116,4.03,2.72,27.33,468,315,3.05,34.66,35.28,354,4021,4092,9.5,8.9,54.0,49.0,3060.336207
1,Allemagne,Germany,Male,34,2.0,1.0,19.15,68,34,1.62,3.38,31.79,55,115,1081,7.8,8.4,79.0,64.0,3089.058824
2,Arménie,Armenia,Female,1,0.0,25.0,0.0,0,25,0.0,0.0,1.0,0,0,1,4.0,8.0,,,3201.0
3,Australie,Australia,Female,18,0.94,1.33,10.44,17,24,6.11,17.72,209.28,110,319,3767,7.5,9.3,55.0,55.0,3103.666667
4,Australie,Australia,Male,3,6.0,4.0,33.33,18,12,8.0,24.0,38.33,24,72,115,12.7,8.3,66.0,66.0,3085.666667


In [11]:
# Statistics about the countries that have top sellers.
# A "top seller" is an account with >= 20 products sold; these accounts make up
# roughly the top 10% of all seller accounts in the dataset (a seller account
# being one that has successfully sold at least 1 product).
# These figures are just insights extracted from the main file.
# NOTE(review): the path points into one machine's home directory; confirm it
# exists before re-running on another machine.
topsellers = pd.read_csv(
    filepath_or_buffer='~/Documents/JONATHANAYALA-CAPSTONE4/DATA/topsellers.csv',
    encoding='utf8',
)

# Preview the first five rows (five is head()'s default).
topsellers.head()

Unnamed: 0,country,eng_country,sellers,topsellers,topsellerratio,femalesellersratio,topfemalesellersratio,femalesellers,malesellers,topfemalesellers,...,topmeanproductssold,topmeanproductslisted,meanproductssold,meanproductslisted,meanofflinedays,topmeanofflinedays,meanfollowers,meanfollowing,topmeanfollowers,topmeanfollowing
0,Taiwan,Taiwan,1,1,100.0,100.0,100.0,1,0,1,...,57.0,56.0,57.0,56.0,11.0,11.0,83.0,8.0,83.0,8.0
1,Slovaquie,Slovakia,2,1,50.0,0.0,0.0,0,2,0,...,27.0,14.0,14.0,7.0,17.0,15.0,10.5,8.5,15.0,8.0
2,Lettonie,Latvia,4,2,50.0,100.0,100.0,4,0,2,...,40.5,18.0,20.75,9.0,120.3,11.5,21.0,52.3,38.0,98.5
3,Bulgarie,Bulgaria,9,4,44.4,66.7,100.0,6,3,4,...,36.25,17.25,18.888889,9.111111,98.3,19.0,28.6,31.6,46.3,19.0
4,Chypre,Cyprus,4,1,25.0,100.0,100.0,4,0,1,...,41.0,66.0,14.0,20.25,17.3,11.0,21.3,10.3,39.0,17.0


# EDA

In [12]:
# users

In [13]:
# Print each column's dtype and non-null count for the users table.
# In the visible part of this run's output every column has 98913 non-null
# values out of 98913 rows, i.e. no missing values among those columns.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98913 entries, 0 to 98912
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   identifierHash       98913 non-null  float64
 1   type                 98913 non-null  object 
 2   country              98913 non-null  object 
 3   eng_country          98913 non-null  object 
 4   language             98913 non-null  object 
 5   socialNbFollowers    98913 non-null  int64  
 6   socialNbFollows      98913 non-null  int64  
 7   socialProductsLiked  98913 non-null  int64  
 8   productsListed       98913 non-null  int64  
 9   productsSold         98913 non-null  int64  
 10  productsPassRate     98913 non-null  float64
 11  productsWished       98913 non-null  int64  
 12  productsBought       98913 non-null  int64  
 13  gender               98913 non-null  object 
 14  civilityGenderId     98913 non-null  int64  
 15  civilityTitle        98913 non-null 

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the users table. In this run the 25%/50%/75% values are 0 for
# productsListed, productsSold, productsWished and productsBought, so most
# accounts show no activity on those measures.
users.describe()

Unnamed: 0,identifierHash,socialNbFollowers,socialNbFollows,socialProductsLiked,productsListed,productsSold,productsPassRate,productsWished,productsBought,civilityGenderId,daysSinceLastLogin,seniority,seniorityAsMonths,seniorityAsYears
count,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0
mean,-6692037000000000.0,3.432269,8.425677,4.420743,0.093304,0.121592,0.812303,1.562595,0.171929,1.773993,581.291236,3063.77187,102.125583,8.510424
std,5.330807e+18,3.882383,52.839572,181.030569,2.050144,2.126895,8.500205,25.192793,2.332266,0.428679,208.855888,168.298621,5.609735,0.467863
min,-9.2231e+18,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,2852.0,95.07,7.92
25%,-4.62289e+18,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,572.0,2857.0,95.23,7.94
50%,-1337990000000000.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,694.0,3196.0,106.53,8.88
75%,4.61639e+18,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,702.0,3201.0,106.7,8.89
max,9.22333e+18,744.0,13764.0,51671.0,244.0,174.0,100.0,2635.0,405.0,3.0,709.0,3205.0,106.83,8.9


In [15]:
# sellers

In [16]:
# Print each column's dtype and non-null count for the sellers table.
# In this run country and eng_country are non-null in 70 of 73 rows and sex in
# 72 of 73, so a few rows have missing identifying fields.
sellers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   country              70 non-null     object 
 1   eng_country          70 non-null     object 
 2   sex                  72 non-null     object 
 3   nbsellers            73 non-null     int64  
 4   meanproductssold     73 non-null     float64
 5   meanproductslisted   73 non-null     float64
 6   meansellerpassrate   73 non-null     float64
 7   totalproductssold    73 non-null     int64  
 8   totalproductslisted  73 non-null     int64  
 9   meanproductsbought   73 non-null     float64
 10  meanproductswished   73 non-null     float64
 11  meanproductsliked    73 non-null     float64
 12  totalbought          73 non-null     int64  
 13  totalwished          73 non-null     int64  
 14  totalproductsliked   73 non-null     int64  
 15  meanfollowers        73 non-null     float

In [17]:
# Summary statistics for the numeric columns of the sellers table.
# In this run percentofappusers and percentofiosusers have counts of 66 and 65
# (versus 73 for the other columns), so those two columns contain missing values.
sellers.describe()

Unnamed: 0,nbsellers,meanproductssold,meanproductslisted,meansellerpassrate,totalproductssold,totalproductslisted,meanproductsbought,meanproductswished,meanproductsliked,totalbought,totalwished,totalproductsliked,meanfollowers,meanfollows,percentofappusers,percentofiosusers,meanseniority
count,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,66.0,65.0,73.0
mean,111.739726,4.98411,4.177671,26.382329,494.260274,379.273973,3.266575,40.527397,72.186986,225.616438,1874.424658,6601.931507,12.583562,16.475342,68.015152,61.492308,3070.617187
std,412.253126,8.091924,7.484078,21.453931,1809.983142,1410.134078,5.21188,113.55102,134.051376,824.771679,7067.534787,26250.135826,12.190946,25.093557,19.716836,22.041242,100.929973
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,8.0,20.0,12.0,2855.0
25%,2.0,1.2,1.0,0.0,3.0,3.0,0.0,1.13,6.33,0.0,4.0,15.0,7.5,8.2,57.5,50.0,3043.909091
50%,5.0,2.79,2.25,29.46,24.0,14.0,1.29,7.0,27.75,15.0,80.0,333.0,9.5,9.0,66.0,57.0,3084.6
75%,22.0,5.25,4.15,37.0,107.0,82.0,3.33,26.4,69.99,77.0,817.0,1081.0,13.4,12.3,75.0,66.0,3130.6
max,2719.0,57.0,56.0,98.0,12027.0,9229.0,24.67,707.0,792.0,5490.0,45611.0,160647.0,83.0,157.0,100.0,100.0,3204.0


In [18]:
# topsellers

In [19]:
# Print each column's dtype and non-null count for the topsellers table.
# In this run bestsoldratio is non-null in 18 of 19 rows; the other columns
# visible in the output are fully populated.
topsellers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   country                 19 non-null     object 
 1   eng_country             19 non-null     object 
 2   sellers                 19 non-null     int64  
 3   topsellers              19 non-null     int64  
 4   topsellerratio          19 non-null     float64
 5   femalesellersratio      19 non-null     float64
 6   topfemalesellersratio   19 non-null     float64
 7   femalesellers           19 non-null     int64  
 8   malesellers             19 non-null     int64  
 9   topfemalesellers        19 non-null     int64  
 10  topmalesellers          19 non-null     int64  
 11  countrysoldratio        19 non-null     float64
 12  bestsoldratio           18 non-null     float64
 13  toptotalproductssold    19 non-null     int64  
 14  totalproductssold       19 non-null     int6

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the topsellers table (19 rows in this run).
topsellers.describe()

Unnamed: 0,sellers,topsellers,topsellerratio,femalesellersratio,topfemalesellersratio,femalesellers,malesellers,topfemalesellers,topmalesellers,countrysoldratio,...,topmeanproductssold,topmeanproductslisted,meanproductssold,meanproductslisted,meanofflinedays,topmeanofflinedays,meanfollowers,meanfollowing,topmeanfollowers,topmeanfollowing
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,...,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,99.578947,6.631579,20.968421,78.778947,80.963158,75.368421,24.210526,5.105263,1.526316,2.032632,...,50.026688,30.124714,12.966926,8.582865,122.631579,23.757895,19.752632,17.389474,55.310526,56.389474
std,178.94857,9.80482,24.430174,22.167182,26.052025,131.976097,47.390317,7.194865,2.874571,1.616621,...,34.374679,19.263995,13.5028,12.208144,73.15394,20.889453,17.128448,14.551552,36.883012,103.303425
min,1.0,1.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.69,...,20.0,0.0,3.722222,1.142857,11.0,11.0,7.0,8.0,15.0,3.0
25%,6.0,1.0,6.45,74.3,71.8,5.5,0.5,1.0,0.0,1.23,...,28.5,18.5,5.167766,2.930594,62.6,11.0,10.8,8.7,36.65,8.55
50%,13.0,2.0,10.1,78.7,85.7,10.0,3.0,1.0,1.0,1.59,...,41.0,21.380952,8.126801,5.5,121.6,12.7,15.1,10.2,46.3,14.5
75%,112.0,5.5,22.5,92.85,100.0,88.0,24.0,4.5,1.0,2.015,...,56.7,39.5,14.0,7.109375,172.8,28.5,20.4,19.75,58.3,36.5
max,713.0,35.0,100.0,100.0,100.0,523.0,190.0,23.0,12.0,7.31,...,170.0,71.333333,57.0,56.0,266.1,79.4,83.0,54.6,167.0,429.3
