In [1]:
#imports to prepare environment

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Statistical Tests
import scipy.stats as stats

# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.model_selection import learning_curve

pd.options.display.float_format = '{:20,.2f}'.format


# GSS 2021 SURVEY: 
## <i>CAN PARTICIPANT RESPONSES ACTUALLY SHOW WHAT ZODIAC SIGN A PERSON IS?</i>
    
> Introduction: 
    Since 1972, the General Social Survey (GSS) has been conducted regularly in the United States to study contemporary American society through the lens of adult survey participants.
    
    The GSS asks US adults questions related to current demographics, opinions, religious/spiritual beliefs, political and life viewpoints, and social class opinions, and it even records people's zodiac signs.
    
    As many viewpoints on astrology overall seem to be changing, oftentimes in terms of self-identity and decision factors, this project is taking a look at astrology/zodiac signs to determine if the responses of the US adult participants of the GSS actually correlate to the astrological signs they provided. 
    
    

In [2]:
#reading in the GSS 2021 data using stata file
df = pd.read_stata('gss2021.dta')

In [3]:
df.head()

Unnamed: 0,year,id,wrkstat,wrkslf,wrkgovt,occ10,prestg10,indus10,marital,martype,...,relitennv,biblenv,postlifenv,kidssolnv,uscitznnv,fucitznnv,fepolnv,scibnftsnv,abanyg,fileversion
0,2021,1,working full time,someone else,,receptionists and information clerks,38.0,offices of dentists,married,,...,not very strong,inspired word,no,somewhat worse,,,,2.0,no,7221.2
1,2021,2,working full time,someone else,,advertising and promotions managers,57.0,advertising and related services,divorced,,...,no religion,,,,,,,,,7221.2
2,2021,3,working part time,someone else,,miscellaneous assemblers and fabricators,35.0,furniture and home furnishings stores,never married,,...,,,,,,,disagree,,yes,7221.2
3,2021,4,working part time,self-employed,,childcare workers,35.0,child day care services,widowed,,...,,,,,,,,,,7221.2
4,2021,6,working full time,someone else,,insurance claims and policy processing clerks,38.0,insurance carriers and related activities,never married,,...,not very strong,inspired word,yes,,a u.s. citizen,,disagree,,yes,7221.2


### Notes: I went through the survey and found that these are the columns I want to keep:
> #### Target Variable: ZODIAC

Demographic Variables:
 - born, race, ethnic, age, sex, sexbirth, sexnow, sexornt, hompop, marital, martype

Growing up Variables:
 - paocc10, maocc10, res16, reg16, reli16

Education variables:
- edu, degree, major1, coldeg1

Career Variables:
- hrs1, hrs2, income, wrkslf, indust, satjob, isco08, occ10

Politics Variables:
- partyid, if16who, polviews, gunlaw, grassv

Religious and Spirituality variables:
- relidesc, relig, cofund, postlifev, postlifenv, sprtprsn, relexp, sprtconnct, sprtlrgr, sprtpurp

Life View variables:
- happy, happycohab, life, obey, popular, thnkself, workhard, helpoth, grtwrks, freemind, decevidc, advfmsci

Health variables:
- mditate1, health, hlthphys, hlthmntl, enjoynat, plantrp, eatmeat, recycle, nobuygrn

View of others:
- fairv, fairnv, helpfulv, helpfulnv, trustv, trustnv, conmedic, contv, conpress, consci, conjudge, conmilitary, conlegis

Social variables:
- socbar, socrel, socommun, socfrend, satsoc, class, satfin, quallife, partners, partnrs5


### My analysis is to see if the above survey responses can actually closely predict what zodiac sign each respondent is. 

I wanted to try this out as I would say that I am moderately unconvinced that Astrology can be a classifier for a person, but I think it would be interesting to test this theory. 

In [4]:
#pulling out only the columns I want to use for this exploration:
# Columns to keep, grouped by the themes used in the notes; the order is the
# order the columns will have in the resulting frame. Not every column named in
# the notes is selected here -- only the ones listed below are kept.
keep_columns = [
    # target
    'zodiac',
    # demographics
    'born', 'race', 'ethnic', 'age', 'sex', 'sexornt', 'marital', 'martype',
    # growing up
    'paocc10', 'maocc10', 'res16', 'reg16',
    # education
    'degree',
    # career
    'income', 'wrkslf', 'satjob', 'occ10',
    # politics
    'partyid', 'if16who', 'polviews', 'gunlaw', 'grassv',
    # religion and spirituality
    'relidesc', 'relig', 'postlifev', 'postlifenv',
    'sprtprsn', 'sprtconnct', 'sprtlrgr', 'sprtpurp',
    # life view
    'happy', 'life', 'obey', 'popular', 'thnkself', 'workhard',
    'helpoth', 'grtwrks', 'freemind', 'decevidc', 'advfmsci',
    # health
    'mditate1', 'health', 'hlthphys', 'hlthmntl',
    'enjoynat', 'eatmeat', 'recycle', 'nobuygrn',
    # view of others
    'fairv', 'fairnv', 'helpfulv', 'helpfulnv', 'trustv', 'trustnv',
    'conmedic', 'contv', 'conpress', 'consci', 'conjudge', 'conlegis',
    # social
    'socbar', 'socrel', 'socommun', 'socfrend', 'satsoc',
    'class', 'satfin', 'quallife', 'partners', 'partnrs5',
]
df = df[keep_columns]

In [5]:
#taking a look
df.head(2)

Unnamed: 0,zodiac,born,race,ethnic,age,sex,sexornt,marital,martype,paocc10,...,socbar,socrel,socommun,socfrend,satsoc,class,satfin,quallife,partners,partnrs5
0,pisces,yes,white,italy,65.0,female,heterosexual or straight,married,,"dredge, excavating, and loading machine operators",...,,,,,fair,middle class,more or less satisfied,good,1 partner,1 partner
1,pisces,yes,white,england and wales,60.0,male,,divorced,,chief executives,...,several times a month,several times a year,once or twice a week,several times a month,very good,upper class,pretty well satisfied,excellent,,


In [6]:
#finding the size of data left

df.shape

(4032, 72)

In [7]:
#wonder what dtypes I will be working with:
df.dtypes

zodiac      category
born        category
race        category
ethnic      category
age         category
              ...   
class       category
satfin      category
quallife    category
partners    category
partnrs5    category
Length: 72, dtype: object

In [8]:
#count of the target variable
df.zodiac.value_counts()

capricorn      390
scorpio        346
sagittarius    339
aquarius       330
virgo          327
libra          319
leo            305
taurus         281
pisces         279
cancer         270
gemini         253
aries          237
Name: zodiac, dtype: int64

In [9]:
#finding nulls in target variable...
df.zodiac.isnull().sum()
#respondents possibly not knowing or caring -- might be a good way to test?

356

In [10]:
#finding nulls in other columns...
df.isnull().sum()

zodiac       356
born          72
race          54
ethnic       438
age          333
            ... 
class         14
satfin        16
quallife     400
partners    1719
partnrs5    1718
Length: 72, dtype: int64

In [11]:
#checking out the possible 'why' of nulls..
df.partnrs5.value_counts()
#possibly due to respondents not wanting to say....

1 partner                   1416
no partners                  336
2 partners                   151
5-10 partners                109
3 partners                    83
1 or more, (unspecified)      83
4 partners                    64
11-20 partners                40
21-100 partners               25
more than 100 partners         7
Name: partnrs5, dtype: int64

In [12]:
df.quallife.value_counts()

very good    1449
good         1243
excellent     477
fair          399
poor           64
Name: quallife, dtype: int64

In [13]:
df.ethnic.value_counts()

england and wales    599
germany              507
ireland              380
italy                186
mexico               179
                    ... 
504.0                  1
405.0                  1
601.0                  1
408.0                  1
509.0                  1
Name: ethnic, Length: 86, dtype: int64

In [14]:
df.age.value_counts()

67.0    89
33.0    82
59.0    81
57.0    79
63.0    77
        ..
19.0    14
86.0    14
87.0     5
88.0     5
18.0     4
Name: age, Length: 72, dtype: int64

### TAKEAWAY: I was considering filling certain nulls with 'unsure' or 'no response', but I may just remove them on a percentage basis, since each column's nulls are different and probably occur for different reasons

In [15]:
#First, I will just drop the nulls in the target variable column, as these won't help determine classifying the target
df = df.dropna(axis=0, subset=['zodiac'])
df.zodiac.isnull().sum()

0

In [16]:
#Because the partners and partnrs5 columns are each missing roughly 40% of their responses (possibly related to age/demographics), I will fill these nulls with 'no response'
#df.partners = df.partners.fillna('no_response')
#ok. I need to set a category up for these columns as this code came out with a Categorical Error

In [17]:
df.partners.value_counts()

1 partner                   1427
no partners                  560
2 partners                    82
3 partners                    42
5-10 partners                 40
4 partners                    27
1 or more, (unspecified)       5
11-20 partners                 4
more than 100 partners         4
21-100 partners                2
Name: partners, dtype: int64

In [18]:
df.partners.dtypes

CategoricalDtype(categories=['no partners', '1 partner', '2 partners', '3 partners',
                  '4 partners', '5-10 partners', '11-20 partners',
                  '21-100 partners', 'more than 100 partners',
                  '1 or more, (unspecified)'],
, ordered=True)

In [19]:
#changing the dtype to an object so that I can fill the nulls
def partner_type(df, column='partners', fill_value='no response'):
    """Return a copy of ``df`` with the nulls in ``column`` replaced by ``fill_value``.

    The column is cast to ``object`` before filling because it is loaded as a
    categorical, and a categorical column rejects a fill value that is not one
    of its existing categories.

    The input frame is left untouched; a new frame is returned so the cell can
    be re-run safely and no chained-assignment warning is raised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str, default 'partners'
        Name of the column whose nulls are filled.
    fill_value : str, default 'no response'
        Label written in place of each null.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; ``column`` has
        dtype ``object`` and no nulls.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    filled = df[column].astype(object).fillna(fill_value)
    # assign() builds a new frame and keeps the column in its original position
    return df.assign(**{column: filled})

In [20]:
#filling nulls
df= partner_type(df)

In [21]:
#same thing for partnrs5
def partner_five(df, column='partnrs5', fill_value='no response'):
    """Return a copy of ``df`` with the nulls in ``column`` replaced by ``fill_value``.

    The column is cast to ``object`` before filling because it is loaded as a
    categorical, and a categorical column rejects a fill value that is not one
    of its existing categories.

    The input frame is left untouched; a new frame is returned so the cell can
    be re-run safely and no chained-assignment warning is raised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str, default 'partnrs5'
        Name of the column whose nulls are filled.
    fill_value : str, default 'no response'
        Label written in place of each null.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; ``column`` has
        dtype ``object`` and no nulls.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    filled = df[column].astype(object).fillna(fill_value)
    # assign() builds a new frame and keeps the column in its original position
    return df.assign(**{column: filled})

In [22]:
df = partner_five(df)
df.head()

Unnamed: 0,zodiac,born,race,ethnic,age,sex,sexornt,marital,martype,paocc10,...,socbar,socrel,socommun,socfrend,satsoc,class,satfin,quallife,partners,partnrs5
0,pisces,yes,white,italy,65.0,female,heterosexual or straight,married,,"dredge, excavating, and loading machine operators",...,,,,,fair,middle class,more or less satisfied,good,1 partner,1 partner
1,pisces,yes,white,england and wales,60.0,male,,divorced,,chief executives,...,several times a month,several times a year,once or twice a week,several times a month,very good,upper class,pretty well satisfied,excellent,no response,no response
5,aries,yes,black,299.00,33.0,female,,never married,,retail salespersons,...,about once a month,about once a month,about once a month,about once a month,good,middle class,pretty well satisfied,good,no response,no response
6,scorpio,no,white,505.00,20.0,male,heterosexual or straight,never married,,"electrical, electronics, and electromechanical...",...,never,almost daily,once or twice a week,about once a month,good,middle class,not satisfied at all,very good,1 partner,1 partner
8,pisces,yes,white,poland,76.0,male,heterosexual or straight,married,marriage between a man and a woman,,...,about once a year,about once a year,about once a month,about once a month,good,working class,not satisfied at all,good,1 partner,1 partner


In [23]:
df.partners.dtype

dtype('O')

In [24]:
df.martype.value_counts()

marriage between a man and a woman                73
not sure                                          17
marriage between two people of the same gender     1
Name: martype, dtype: int64

In [25]:
df.sexornt.value_counts()

heterosexual or straight       1970
bisexual                         96
gay, lesbian, or homosexual      74
Name: sexornt, dtype: int64

In [26]:
df.dtypes

zodiac      category
born        category
race        category
ethnic      category
age         category
              ...   
class       category
satfin      category
quallife    category
partners      object
partnrs5      object
Length: 72, dtype: object

In [27]:
df.shape

(3676, 72)

In [28]:
#changing all the dtypes to objects
df = df.astype(object)

In [29]:
df.dtypes

zodiac      object
born        object
race        object
ethnic      object
age         object
             ...  
class       object
satfin      object
quallife    object
partners    object
partnrs5    object
Length: 72, dtype: object

In [30]:
df.isnull().sum()

zodiac        0
born         18
race         47
ethnic      397
age          29
           ... 
class         7
satfin       13
quallife    256
partners      0
partnrs5      0
Length: 72, dtype: int64

In [31]:
#the rest of nulls will become unknowns...
df=df.fillna('unknown')

In [32]:
df.isnull().sum()

zodiac      0
born        0
race        0
ethnic      0
age         0
           ..
class       0
satfin      0
quallife    0
partners    0
partnrs5    0
Length: 72, dtype: int64

### NOTES: Nulls are done! ✅ Let's Split...

_______________________________________________

## Splitting Data into Train, Validate, Test:

In [33]:
#split the data into train and test
def split(df, test_size=.15, validate_size=.2, random_state=13, stratify=None):
    """Split ``df`` into train, validate and test frames and print their shapes.

    The split happens in two steps: ``test_size`` of all rows is held out as
    the test set, then ``validate_size`` of the *remaining* rows becomes the
    validate set. With the defaults that is 15% test and 20% of the remaining
    85% for validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    test_size : float, default 0.15
        Share of ``df`` held out as the test set.
    validate_size : float, default 0.2
        Share of the non-test rows held out as the validate set.
    random_state : int, default 13
        Seed used for both splits so the result is reproducible.
    stratify : str or None, default None
        Optional column name (e.g. the target) whose class proportions should
        be preserved in every split. ``None`` keeps the plain random split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)``.
    """
    # Stratification labels must be taken from the frame being split at each step.
    first_labels = df[stratify] if stratify is not None else None
    train_and_validate, test = train_test_split(
        df,
        random_state=random_state,
        test_size=test_size,
        stratify=first_labels,
    )

    second_labels = train_and_validate[stratify] if stratify is not None else None
    train, validate = train_test_split(
        train_and_validate,
        random_state=random_state,
        test_size=validate_size,
        stratify=second_labels,
    )

    print('Train: %d rows, %d cols' % train.shape)
    print('Validate: %d rows, %d cols' % validate.shape)
    print('Test: %d rows, %d cols' % test.shape)

    return train, validate, test

In [34]:
train, validate, test = split(df)

Train: 2499 rows, 72 cols
Validate: 625 rows, 72 cols
Test: 552 rows, 72 cols


In [35]:
train.head()

Unnamed: 0,zodiac,born,race,ethnic,age,sex,sexornt,marital,martype,paocc10,...,socbar,socrel,socommun,socfrend,satsoc,class,satfin,quallife,partners,partnrs5
238,libra,yes,white,germany,42.0,female,heterosexual or straight,married,unknown,first-line supervisors of office and administr...,...,about once a year,several times a year,never,about once a year,fair,middle class,more or less satisfied,good,1 partner,1 partner
342,libra,yes,white,germany,51.0,male,unknown,married,unknown,probation officers and correctional treatment ...,...,several times a year,several times a month,several times a month,about once a month,excellent,upper class,more or less satisfied,very good,no response,no response
786,sagittarius,yes,black,unknown,45.0,female,heterosexual or straight,never married,unknown,"inspectors, testers, sorters, samplers, and we...",...,never,about once a month,never,never,very good,lower class,not satisfied at all,fair,2 partners,2 partners
1767,libra,yes,black,299.00,70.0,female,heterosexual or straight,never married,unknown,chefs and head cooks,...,several times a year,once or twice a week,once or twice a week,several times a month,very good,middle class,more or less satisfied,very good,1 partner,1 partner
509,virgo,yes,black,299.00,39.0,female,heterosexual or straight,never married,unknown,unknown,...,unknown,unknown,unknown,unknown,very good,working class,not satisfied at all,very good,1 partner,1 partner


In [36]:
#looking at zodiac count (in order) by function:
def zodiac_order(df):
    """Print the number of rows for each zodiac sign, Aries through Pisces.

    One line is printed per sign in the form ``Sign: count``; a sign that does
    not occur in ``df['zodiac']`` is reported with a count of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``zodiac`` column holding lower-case sign names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, unchanged.
    """
    signs_in_order = (
        'aries', 'taurus', 'gemini', 'cancer', 'leo', 'virgo',
        'libra', 'scorpio', 'sagittarius', 'capricorn', 'aquarius', 'pisces',
    )
    sign_column = df['zodiac']
    for sign in signs_in_order:
        # Label is the capitalised sign name followed by a colon, e.g. "Aries:"
        print(f'{sign.capitalize()}:', (sign_column == sign).sum())
    return df

In [37]:
zodiac_order(train)

Aries: 161
Taurus: 188
Gemini: 183
Cancer: 188
Leo: 211
Virgo: 212
Libra: 204
Scorpio: 219
Sagittarius: 228
Capricorn: 271
Aquarius: 225
Pisces: 209


Unnamed: 0,zodiac,born,race,ethnic,age,sex,sexornt,marital,martype,paocc10,...,socbar,socrel,socommun,socfrend,satsoc,class,satfin,quallife,partners,partnrs5
238,libra,yes,white,germany,42.00,female,heterosexual or straight,married,unknown,first-line supervisors of office and administr...,...,about once a year,several times a year,never,about once a year,fair,middle class,more or less satisfied,good,1 partner,1 partner
342,libra,yes,white,germany,51.00,male,unknown,married,unknown,probation officers and correctional treatment ...,...,several times a year,several times a month,several times a month,about once a month,excellent,upper class,more or less satisfied,very good,no response,no response
786,sagittarius,yes,black,unknown,45.00,female,heterosexual or straight,never married,unknown,"inspectors, testers, sorters, samplers, and we...",...,never,about once a month,never,never,very good,lower class,not satisfied at all,fair,2 partners,2 partners
1767,libra,yes,black,299.00,70.00,female,heterosexual or straight,never married,unknown,chefs and head cooks,...,several times a year,once or twice a week,once or twice a week,several times a month,very good,middle class,more or less satisfied,very good,1 partner,1 partner
509,virgo,yes,black,299.00,39.00,female,heterosexual or straight,never married,unknown,unknown,...,unknown,unknown,unknown,unknown,very good,working class,not satisfied at all,very good,1 partner,1 partner
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2963,capricorn,no,other,302.00,32.00,male,unknown,married,unknown,first-line supervisors of retail sales workers,...,never,several times a month,several times a month,several times a year,very good,working class,more or less satisfied,good,no response,no response
1055,libra,yes,white,ireland,42.00,male,heterosexual or straight,never married,unknown,education administrators,...,unknown,unknown,unknown,unknown,fair,working class,not satisfied at all,fair,no partners,1 partner
151,aries,yes,white,unknown,62.00,male,unknown,married,unknown,lawyers,...,never,once or twice a week,about once a month,about once a month,good,middle class,more or less satisfied,very good,no response,no response
2317,gemini,yes,other,japan,56.00,female,heterosexual or straight,married,unknown,precision instrument and equipment repairers,...,unknown,unknown,unknown,unknown,excellent,upper class,pretty well satisfied,excellent,1 partner,1 partner
